mirror of https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00

commit 18e8f12d5e
Merge branch 'master' of github.com:moses-smt/mosesdecoder
.gitmodules (vendored, 3 lines added)
@@ -0,0 +1,3 @@
+[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
+	path = contrib/arrow-pipelines/python/libs/pypeline
+	url = git://github.com/ianj-als/pypeline.git
@@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
 Generally, for trouble installing external libraries, you should get support
 directly from the library maker:
 
-Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
+Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
 IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
 SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user
NOTICE (2 lines added)
@@ -1,3 +1,5 @@
 This code includes data from Daniel Naber's Language Tools (czech abbreviations).
 
+This code includes data from czech wiktionary (also czech abbreviations).
+
 
Binary file not shown.
contrib/arrow-pipelines/python/README (new file, 32 lines)
@@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================

To use the demonstration you must first initialise the git submodules for this clone. Return to the top-level directory and issue the following command:

    $ git submodule update --init

This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:

    $ cd libs/pypeline
    $ python setup.py install

Alternatively, you can add the Pypeline library to an appropriate PYTHONPATH environment variable.

This demonstration implements the training pipeline shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.

Three environment variables need to be set before the manager.py script can be run:

 - MOSES_HOME : The directory where Moses has been cloned, or installed,
 - IRSTLM     : The installation directory of IRSTLM, and
 - GIZA_HOME  : The installation directory of GIZA++.

The manager.py script takes four positional command-line arguments:

 - The source language code,
 - The target language code,
 - The source corpus file, which *must* be cleaned prior to use, and
 - The target corpus file, which *must* be cleaned prior to use.

For example, run the manager.py script with:

    $ python manager.py en lt cleantrain.en cleantrain.lt
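The same run can also be driven from Python instead of the shell. This is a minimal sketch, assuming the three environment variables point at real installations; every path below is a placeholder, not part of the original README:

    # Hypothetical Python driver, equivalent to the shell invocation above.
    import os

    # Placeholder installation paths -- substitute your own.
    os.environ['MOSES_HOME'] = '/opt/mosesdecoder'
    os.environ['IRSTLM'] = '/opt/irstlm'
    os.environ['GIZA_HOME'] = '/opt/giza-pp'

    from manager import main

    # Source language, target language, cleaned source and target corpora.
    main('en', 'lt', 'cleantrain.en', 'cleantrain.lt')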
contrib/arrow-pipelines/python/libs/pypeline (new submodule)
@@ -0,0 +1 @@
+Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb
contrib/arrow-pipelines/python/manager.py (new file, 192 lines)
@@ -0,0 +1,192 @@
import logging
import os

from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
    cons_function_component, \
    cons_wire, \
    cons_split_wire, \
    cons_unsplit_wire, \
    cons_dictionary_wire


#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")


# Build the pipeline components
def build_components(components, configuration, executor):
    pipeline_components = dict()
    pipeline_configuration = dict()

    for component_id, module_name in components.items():
        logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))

        module = __import__(module_name, fromlist = ['configure', 'initialise'])

        # Component builds its own configuration object
        config_func = getattr(module, 'configure')
        component_config = config_func(configuration)
        pipeline_configuration.update(component_config)

        # Now build the component
        init_func = getattr(module, 'initialise')
        component_function = init_func(component_config)

        # A wrapper for the component's function that submits to the executor
        def get_component_function_wrapper(inner_function, comp_id, mod_name):
            def component_function_wrapper(a, s):
                logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
                            (comp_id, mod_name, a, s))
                return inner_function(a, s)

            return component_function_wrapper

        # Arrowize the component
        component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))

        # And store
        pipeline_components[component_id] = component

    return pipeline_components, pipeline_configuration


# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
    # Global configuration
    # One day, this configuration shall be constructed from
    # command line options, or a properties file.
    configuration = {
        'moses_installation_dir': os.environ['MOSES_HOME'],
        'irstlm_installation_dir': os.environ['IRSTLM'],
        'giza_installation_dir': os.environ['GIZA_HOME'],
        'src_lang': src_lang,
        'src_tokenisation_dir': './tokenisation',
        'trg_lang': trg_lang,
        'trg_tokenisation_dir': './tokenisation',
        'segment_length_limit': 60,
        'irstlm_smoothing_method': 'improved-kneser-ney',
        'language_model_directory': './language-model',
        'translation_model_directory': './translation-model',
        'mert_working_directory': './mert',
        'evaluation_data_size': 100,
        'development_data_size': 100
    }

    # The modules to load
    # In the future, the components shall be specified in some kind of
    # pipeline description file.
    component_modules = {
        'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
        'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
        'cleanup': 'training.components.cleanup.cleanup',
        'data_split': 'training.components.data_split.data_split',
        'irstlm_build': 'training.components.irstlm_build.irstlm_build',
        'model_training': 'training.components.model_training.model_training',
        'mert': 'training.components.mert.mert'
    }

    # The thread pool
    executor = ThreadPoolExecutor(max_workers = 3)

    # Phew, build the required components
    components, component_config = build_components(component_modules, configuration, executor)

    #
    # Wire up components
    # Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
    # declared then used, i.e., bind a component instance to a unique component
    # identifier, then wire component instances together by identifier.
    #

    #
    # Tokenisation of source and target...
    #
    # IRSTLM Build components
    irstlm_build_component = cons_split_wire() >> \
                             (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
                              components['irstlm_build']).second() >> \
                             cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
                                                             'trg_language_model_filename': b['compiled_lm_filename']})

    # The complete tokenisation component
    tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
                             irstlm_build_component.second() >> \
                             cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
                                                             'trg_filename': b['tokenised_trg_filename'],
                                                             'trg_language_model_filename': b['trg_language_model_filename']})

    #
    # Cleanup and Data Splitting...
    #

    #
    # A function that clips off the last '.' delimited string
    #
    def clip_last_bit(filename):
        bn = os.path.basename(filename)
        directory = os.path.dirname(filename)
        bits = bn.split(".")
        bits.pop()
        return os.path.join(directory, ".".join(bits))

    cleanup_datasplit_component = components['cleanup'] >> \
                                  cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
                                                          'trg_filename': a['cleaned_trg_filename']}) >> \
                                  components['data_split'] >> \
                                  cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
                                                          'eval_src_filename': a['eval_src_filename'],
                                                          'eval_trg_filename': a['eval_trg_filename']})

    #
    # Translation model training
    #
    translation_model_component = cons_split_wire() >> \
                                  components['model_training'].first() >> \
                                  cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                                                  'development_data_filename': b['eval_src_filename']})

    #
    # The whole pipeline
    #
    pipeline = tokenisation_component >> \
               cons_split_wire() >> \
               (cleanup_datasplit_component >> translation_model_component).first() >> \
               cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                               'development_data_filename': clip_last_bit(t['development_data_filename']),
                                               'trg_language_model_filename': b['trg_language_model_filename'],
                                               'trg_language_model_order': 3,
                                               'trg_language_model_type': 9}) >> \
               components['mert']

    #
    # The input to the pipeline
    #
    value = {'src_filename': src_filename,
             'trg_filename': trg_filename}

    #
    # Evaluate the pipeline
    #
    logger.info("Evaluating pipeline with input [%s]..." % value)
    new_value = eval_pipeline(executor, pipeline, value, component_config)

    #
    # Wait for all components to finish
    #
    executor.shutdown(True)

    logger.info("Pipeline evaluated to %s" % new_value)


if __name__ == '__main__':
    import sys

    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
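build_components() above defines the contract that every module named in component_modules must satisfy: a configure(args) function that extracts the keys the component needs from the global configuration, and an initialise(config) function that returns a callable taking a value dictionary and a state dictionary. The wired pipeline then composes these with pypeline's combinators, as the code above shows: >> for sequencing, & for parallel fan-out, and .first()/.second() for routing one half of a split wire. A minimal sketch of a conforming component module, using hypothetical names that are not part of this commit:

    # hypothetical_component.py -- illustrates the configure/initialise
    # contract that build_components() relies on; not a module in this commit.

    def configure(args):
        # Pull out only the keys this component cares about.
        return {'work_dir': args.get('work_dir', '/tmp')}

    def initialise(config):
        # Return the component function: it receives the value dictionary 'a'
        # and the state dictionary 's', and returns the value dictionary that
        # is handed to the next component in the pipeline.
        def process(a, s):
            return {'output_filename': a['input_filename'] + '.processed'}
        return process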
contrib/arrow-pipelines/python/test/__init__.py (new empty file)
contrib/arrow-pipelines/python/test/test.py (new file, 11 lines)
@@ -0,0 +1,11 @@
import subprocess


def cat(filename, content):
    fh = open(filename, "w")
    for line in content:
        #print(line, file=fh)
        print >> fh, line
    fh.close()


def diff(filename1, filename2):
    subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)
contrib/arrow-pipelines/python/training/__init__.py (new empty file)
@@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print >>ofh1, l1,
                print >>ofh2, l2,

    def _make_cleaned_filename(filename):
        bits = filename.split(".")
        bits[-1] = "clean"
        return ".".join(bits)

    def _filter_main(value, config):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")

            cleaned_src_filename = _make_cleaned_filename(input_src_filename)
            cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
            ofh1 = open(cleaned_src_filename, "w")
            ofh2 = open(cleaned_trg_filename, "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': cleaned_src_filename,
                    'cleaned_trg_filename': cleaned_trg_filename}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _filter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline


    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)


    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])


    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)


    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))


    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass


    _test_main()
@@ -0,0 +1,109 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print(l1, end='', file=ofh1)
                print(l2, end='', file=ofh2)

    def _filter_main(config, value):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            ifh1 = open(value['src_filename'], "r")
            ifh2 = open(value['trg_filename'], "r")
            ofh1 = open(value['cleaned_src_filename'], "w")
            ofh2 = open(value['cleaned_trg_filename'], "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': value['cleaned_src_filename'],
                    'cleaned_trg_filename': value['cleaned_trg_filename']}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return cons_function_component(_filter_main)


if __name__ == '__main__':
    import os
    import tempfile
    import training.components.shared.test as thelp


    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_filename': src_filename[1] + ".clean",
            'cleaned_trg_filename': trg_filename[1] + ".clean",
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)


    def _run_test(configuration, box_eval):
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)

        run_pipeline(box, box_config, box_eval)
        thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
        thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])


    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)


    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))


    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass


    _test_main()
contrib/arrow-pipelines/python/training/components/data_split/data_split.py (new file, 146 lines)
@@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['evaluate_size'] = args['evaluation_data_size']
    result['development_size'] = args['development_data_size']
    return result


def initialise(config):

    def _copy(size, inp, ofh1, ofh2):
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()

        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)

        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)
                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result

        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline


    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)


    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)


    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)


    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        #expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))


    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass


    _test_main()
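As a worked example of _make_split_filename() above, with a hypothetical input name of the form base.lang.ext, the data-set tag is spliced in before the language code so the split files keep their language suffix:

    # _make_split_filename("corpus.en.clean", "eval"), step by step:
    bits = "corpus.en.clean".split(".")   # ['corpus', 'en', 'clean']
    last = bits.pop()                     # 'clean'
    lang_code = bits.pop()                # 'en'
    bits += [last, "eval", lang_code]     # ['corpus', 'clean', 'eval', 'en']
    print(".".join(bits))                 # corpus.clean.eval.en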
contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py (new file, 106 lines)
@@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    config = dict()
    config['irstlm_install_directory'] = args['irstlm_installation_dir']
    config['smoothing_method'] = args['irstlm_smoothing_method']
    config['lm_directory'] = args['language_model_directory']
    return config


def initialise(config):
    def process(a, s):
        # Create the LM directory if we need to
        if os.path.exists(s['lm_directory']) is False:
            os.makedirs(s['lm_directory'])

        # The filename of the file to chew through
        start_end_input_filename = a['input_filename']
        if os.path.exists(start_end_input_filename) is False:
            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)

        # Derive the output file name for the add start-end marker processor
        filename_bits = os.path.basename(start_end_input_filename).split(".")
        filename_bits[2] = "sb"
        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the output file name of the LM build
        filename_bits[2] = "lm"
        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the compiled LM file name
        filename_bits[2] = "arpa"
        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # First thing to do is add start and end markers
        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
        infile = open(start_end_input_filename, 'r')
        outfile = open(start_end_output_filename, 'w')
        print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
        return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
        if return_code:
            raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
                            (start_end_input_filename, start_end_output_filename, return_code))

        # Next build the language model
        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
        try:
            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
                                "-i", start_end_output_filename,
                                "-t", tmp_dir,
                                "-p",
                                "-s", s['smoothing_method'],
                                "-o", lm_filename]
            print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
            return_code = subprocess.check_call(build_lm_cmdline)
            if return_code:
                raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)

        # Compile the LM
        lm_filename = lm_filename + ".gz"
        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
                              "--text", "yes",
                              lm_filename,
                              compiled_lm_filename]
        print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
        return_code = subprocess.check_call(compile_lm_cmdline)
        if return_code:
            raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)

        output = {'add_start_end_filename': start_end_output_filename,
                  'lm_filename': lm_filename,
                  'compiled_lm_filename': compiled_lm_filename}

        print "IRSTLM Build: Output = %s" % output

        return output

    return process


if __name__ == '__main__':
    from pypeline.helpers.helpers import eval_pipeline

    lm_dir = os.environ["PWD"]
    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
                     'irstlm_smoothing_method': 'improved-kneser-ney',
                     'language_model_directory': lm_dir}
    component_config = configure(configuration)
    component = initialise(component_config)

    value = eval_pipeline(component,
                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
                          component_config)
    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
    print "Target: %s" % target
    if value != target:
        raise Exception("Massive fail!")
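The filename derivation in process() assumes input names with at least four dot-separated fields, such as base.pair.tok.lang, and rewrites the third field at each stage. Using the input from the self-test above:

    # Stage filenames derived from "news-commentary-v7.fr-en.tok.en":
    filename_bits = "news-commentary-v7.fr-en.tok.en".split(".")
    filename_bits[2] = "sb"
    print(".".join(filename_bits))   # news-commentary-v7.fr-en.sb.en   (start/end-marked text)
    filename_bits[2] = "lm"
    print(".".join(filename_bits))   # news-commentary-v7.fr-en.lm.en   (built LM, gzipped to .lm.en.gz)
    filename_bits[2] = "arpa"
    print(".".join(filename_bits))   # news-commentary-v7.fr-en.arpa.en (compiled ARPA text)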
contrib/arrow-pipelines/python/training/components/mert/mert.py (new executable file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['mert_working_dir'] = args['mert_working_directory']
    return result


def initialise(config):

    def process(a, s):
        infilename = os.path.abspath(a['development_data_filename'])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        orig_moses_ini = os.path.abspath(a['moses_ini_file'])

        if not os.path.exists(orig_moses_ini):
            raise Exception("Error: Input moses.ini does not exist")

        workdir = os.path.abspath(config['mert_working_dir'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')
        #change lm configuration in moses ini
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % locals()
        os.system(cmd)

        #the command
        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception('Failed MERT')

        return {'moses_ini_file': new_mosesini}

    return process


if __name__ == '__main__':

    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.path.abspath('../../../../'),
                         'mert_working_directory': '../../../../../tuning'}
        values = {'development_data_filename': '../../../../../corpus/tune',
                  'moses_ini_file': '../../../../../model/model/moses.ini',
                  'trg_language_model_filename': '../../../../../corpus/train.lt.lm',
                  'trg_language_model_type': 9,
                  'trg_language_model_order': 4}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()
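Both shell commands in process() are assembled with %-formatting against locals(), so each %(name)s placeholder must correspond to a local variable defined earlier in the function. A small sketch of the same technique; the paths are placeholders, not taken from a real run:

    # Illustration of the "cmd % locals()" templating used above.
    mert_perl = '/opt/mosesdecoder/scripts/training/mert-moses.pl'
    bin_dir = '/opt/mosesdecoder/bin'
    workdir = '/tmp/mert'
    cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s' % locals()
    print(cmd)
    # /opt/mosesdecoder/scripts/training/mert-moses.pl --mertdir /opt/mosesdecoder/bin --working-dir /tmp/mert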
contrib/arrow-pipelines/python/training/components/model_training/model_training.py (new file, 72 lines)
@@ -0,0 +1,72 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['external_bin_dir'] = args['giza_installation_dir']
    result['model_directory'] = args['translation_model_directory']
    return result


def initialise(config):

    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])
        #create a dummy lm file
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = workdir + os.sep + 'log'

        #the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'

        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception('Failed training model')

        return {'moses_ini_file': mosesini}

    return process


if __name__ == '__main__':

    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.environ['MOSES_HOME'],
                         'giza_installation_dir': os.environ['GIZA_HOME'],
                         'translation_model_directory': 'model-dir'}
        values = {'training_data_filename': '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()
contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py (new file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):

    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename': outfilename}

    return process


if __name__ == '__main__':

    def __test():
        configuration = {'src_lang': 'de',
                         'src_tokenisation_dir': 'tmptok',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'src_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()
@@ -0,0 +1,3 @@
+asdfweoih
+awfwoeijf awefo
+what's this
contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import sys, os, subprocess


class Tokenizer:

    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        self.arrows = None
        self.lang = lang
        #check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            raise Exception("Perl tokenizer does not exist")

    def file_tokenise(self, infilename, outfilename):
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()


if __name__ == '__main__':
    #do some test
    pass
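Both the source and target tokeniser components call this class through its static entry point; a minimal usage sketch with hypothetical paths:

    # Hypothetical call to the batch interface defined above.
    outfile = Tokenizer.batch_tokenise(
        'en',                  # language code passed to tokenizer.perl -l
        '/opt/mosesdecoder',   # Moses clone containing scripts/tokenizer/
        'corpus.en',           # input file to tokenise
        './tokenisation')      # working directory for the output
    print(outfile)             # ./tokenisation/corpus.en.tok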
contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py (new file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['trg_lang'] = args['trg_lang']
    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):

    def process(a, s):
        infilename = a['trg_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            infilename,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename': outfilename}

    return process


if __name__ == '__main__':

    def __test():
        configuration = {'trg_lang': 'de',
                         'trg_tokenisation_dir': 'tmptoktrg',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'trg_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()
@@ -24,7 +24,7 @@
 <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
 <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
 <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
 <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
 <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -133,8 +133,13 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope" versionNumber="1">
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+</configuration>
+<configuration configurationName="Debug">
 <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
 </configuration>
 </storageModule>
 <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
@@ -18,11 +18,14 @@
 <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 <tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
 <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+</option>
 <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
 </tool>
 <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
@@ -119,5 +122,13 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/extractor"/>
+</configuration>
+<configuration configurationName="Debug">
+<resource resourceType="PROJECT" workspacePath="/extractor"/>
+</configuration>
+</storageModule>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
@@ -24,7 +24,7 @@
 <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
 <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
 <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
 <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
 <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -131,7 +131,14 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/lm"/>
+</configuration>
+<configuration configurationName="Debug">
+<resource resourceType="PROJECT" workspacePath="/lm"/>
+</configuration>
+</storageModule>
 <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
@@ -141,11 +141,6 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
 </link>
-<link>
-<name>build_binary.cc</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
-</link>
 <link>
 <name>clean.sh</name>
 <type>1</type>
@@ -176,11 +171,6 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
 </link>
-<link>
-<name>fragment.cc</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
-</link>
 <link>
 <name>left.hh</name>
 <type>1</type>
@@ -211,11 +201,6 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
 </link>
-<link>
-<name>max_order.cc</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
-</link>
 <link>
 <name>max_order.hh</name>
 <type>1</type>
@@ -241,11 +226,6 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
 </link>
-<link>
-<name>ngram_query.cc</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
-</link>
 <link>
 <name>ngram_query.hh</name>
 <type>1</type>
@@ -7,7 +7,7 @@
 <externalSetting>
 <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
 <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
-<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
 </externalSetting>
 </externalSettings>
 <extensions>
@@ -23,13 +23,14 @@
 <folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
 <targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
-<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
+<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
 <tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
 <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
 <option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
 <option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
 <option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
 <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
+<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
 </option>
 <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
 </tool>
@@ -45,11 +46,8 @@
 </tool>
 </toolChain>
 </folderInfo>
-<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
-<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
-</fileInfo>
 <sourceEntries>
-<entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
 </sourceEntries>
 </configuration>
 </storageModule>
@@ -61,7 +59,7 @@
 <externalSetting>
 <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
 <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
-<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
 </externalSetting>
 </externalSettings>
 <extensions>
@@ -119,5 +117,13 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+</configuration>
+<configuration configurationName="Debug">
+<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+</configuration>
+</storageModule>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
@@ -19,7 +19,7 @@
 <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 <tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -46,6 +46,7 @@
 <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
 <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@@ -154,8 +155,13 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope" versionNumber="1">
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+</configuration>
+<configuration configurationName="Debug">
 <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
 </configuration>
 </storageModule>
 <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
@@ -19,7 +19,7 @@
 <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 <tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -46,6 +46,8 @@
 <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
 <option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
+<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
@@ -155,8 +157,13 @@
 <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 </scannerConfigBuildInfo>
 </storageModule>
-<storageModule moduleId="refreshScope" versionNumber="1">
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+</configuration>
+<configuration configurationName="Debug">
 <resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
 </configuration>
 </storageModule>
 <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>

<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@ -9,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -26,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -152,8 +150,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -127,6 +127,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

@ -156,11 +156,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
</link>
<link>
<name>vertex_generator.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
</link>
<link>
<name>vertex_generator.hh</name>
<type>1</type>

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -136,8 +136,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

42
contrib/rpm/README
Normal file
@ -0,0 +1,42 @@
Building Moses RPM
==================

*** WARNING ***
Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
*** WARNING ***


Building the RPM SPEC file
--------------------------

The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed for the RPM build. This script needs the following information:

- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The version of the installed Moses distribution.

For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):

$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0

This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.


Building the RPM
----------------

Change directory to $HOME/rpmbuild, and build the binary RPM with:

$ rpmbuild -bb SPECS/moses.spec

This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses, and create the RPM at $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.

For example, building v1.0 on a 64-bit Intel architecture would produce an RPM called moses-1.0-1.x86_64.rpm.
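
The package can then be installed like any other RPM. An illustrative command (the exact file name depends on the version and architecture you built):

$ sudo rpm -ivh $HOME/rpmbuild/RPMS/x86_64/moses-1.0-1.x86_64.rpm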

Building a Debian package
-------------------------

The Alien tool converts RPM packages to Debian packages. If a Debian package is required, follow the instructions on this page:

https://help.ubuntu.com/community/RPM/AlienHowto
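As a sketch (an assumed invocation, not taken from the guide above; package name as in the example RPM):

$ sudo alien --to-deb moses-1.0-1.x86_64.rpm

Note that Alien increments the package release number by default; pass --keep-version to preserve it.
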
63
contrib/rpm/build_source.sh
Executable file
@ -0,0 +1,63 @@
#!/bin/bash

BRANCH="master"
declare -i NO_RPM_BUILD=0
declare -r RPM_VERSION_TAG="___RPM_VERSION__"

function usage() {
  echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
  exit 1
}

if [ $# -lt 4 ]; then
  usage
fi

while getopts r:b:v:nh OPTION
do
  case "$OPTION" in
    r) REPO="${OPTARG}";;
    b) BRANCH="${OPTARG}";;
    v) VERSION="${OPTARG}";;
    n) NO_RPM_BUILD=1;;
    [h\?]) usage;;
  esac
done

if [ ! -d ./rpmbuild ]; then
  echo "RPM build directory not in current working directory"
  exit 1
fi

declare -r MOSES_DIR="moses-${VERSION}"
git clone ${REPO} ${MOSES_DIR}
if [ $? -ne 0 ]; then
  echo "Failed to clone Git repository ${REPO}"
  exit 3
fi

cd ${MOSES_DIR}

git checkout ${BRANCH}
if [ $? -ne 0 ]; then
  echo "Failed to checkout branch ${BRANCH}"
  exit 3
fi

cd ..

tar -cf moses-${VERSION}.tar ${MOSES_DIR}
gzip -f9 moses-${VERSION}.tar

if [ ${NO_RPM_BUILD} -eq 0 ]; then
  if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
    mkdir -p ${HOME}/rpmbuild/SPECS
  fi
  eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
  if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
    mkdir -p ${HOME}/rpmbuild/SOURCES
  fi
  mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES
fi

rm -Rf ${MOSES_DIR}
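
Note that the script also accepts a -n flag not mentioned in the README: as the getopts loop above shows, it sets NO_RPM_BUILD, so only the moses-<version>.tar.gz tarball is created (and left in the current directory) without touching $HOME/rpmbuild. For example:

$ ./build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 -n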
65
contrib/rpm/rpmbuild/SPECS/moses.spec
Normal file
@ -0,0 +1,65 @@
Name: moses
Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
Version: ___RPM_VERSION__
Release: 1
URL: http://www.statmt.org/moses/
Source0: %{name}-%{version}.tar.gz
License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
%prep
%setup -q

mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7

wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz

cd $RPM_BUILD_DIR

tar -zxf irstlm-5.70.04.tgz
tar -zxf giza-pp-v1.0.7.tgz

cd irstlm-5.70.04
bash regenerate-makefiles.sh --force
./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
make
make install

cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%clean
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v1.0.7/*
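
To sanity-check the built package against the %files manifest above, listing its contents is usually enough. An illustrative command (file name as in the README example):

$ rpm -qlp $HOME/rpmbuild/RPMS/x86_64/moses-1.0-1.x86_64.rpm
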
@ -620,10 +620,27 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
template <class T>
void ShiftOffsets(vector<T> &offsets, T shift)
{
T currPos = shift;
for (size_t i = 0; i < offsets.size(); ++i) {
shift += offsets[i];
offsets[i] += shift;
if (offsets[i] == 0) {
offsets[i] = currPos;
++currPos;
}
else {
currPos += offsets[i];
}
}
}

size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
{
size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
for (size_t i = 0; i < prevHypos.size(); ++i) {
size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
ret -= (childSize - 1);
}
return ret;
}

size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT

const TargetPhrase &tp = hypo->GetCurrTargetPhrase();

vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);

// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);

const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT

const ChartTrellisNode &prevNode = *prevNodes[sourceInd];

// 1st. calc source size
// calc source size
size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;

// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
}
}

// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);

// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);

// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];

pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);

}
}

return totalTargetSize;
@ -702,6 +723,7 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
{
ostringstream out;

if (hypo) {
Alignments retAlign;
OutputAlignment(retAlign, hypo, 0);

@ -711,6 +733,7 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
const pair<size_t, size_t> &alignPoint = *iter;
out << alignPoint.first << "-" << alignPoint.second << " ";
}
}
out << endl;

m_alignmentInfoCollector->Write(translationId, out.str());
@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth

const TargetPhrase &tp = hypo->GetCurrTargetPhrase();

vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);

// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);

const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth

const ChartHypothesis *prevHypo = prevHypos[sourceInd];

// 1st. calc source size
// calc source size
size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;

// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -760,28 +788,28 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
}
}

// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);

// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);

// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];

pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);

}
}

return totalTargetSize;
}

@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
}
}

ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
const StaticData &staticData = StaticData::Instance();
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[1];
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
return file;
}

/***
* print surface factor only for the given phrase
*/
@ -262,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}

void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
std::vector<const Hypothesis *> edges;
const Hypothesis *currentHypo = hypo;
while (currentHypo) {
edges.push_back(currentHypo);
currentHypo = currentHypo->GetPrevHypo();
}

OutputAlignment(out, edges);

}

void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;

@ -117,6 +117,8 @@ public:
return *m_outputSearchGraphStream;
}

std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();

std::ostream &GetDetailedTranslationReportingStream() {
assert (m_detailedTranslationReportingStream);
return *m_detailedTranslationReportingStream;
@ -137,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);

void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);

}

@ -83,14 +83,18 @@ public:
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector) :
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector),
m_unknownsCollector(unknownsCollector) {}
m_unknownsCollector(unknownsCollector),
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}

/** Translate one sentence
* gets called by main function implemented at end of this source file */
@ -143,6 +147,42 @@ public:
#endif
}

// Output search graph in HTK standard lattice format (SLF)
if (m_outputSearchGraphSLF) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsSLF(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
}

// Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
if (m_outputSearchGraphHypergraph) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
file -> close();
delete file;
}

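The two blocks above take their output locations from the new decoder parameters. A hypothetical invocation enabling both outputs (parameter names taken from the GetParam() calls in this diff; the directory names and optional weights file are illustrative, not from the commit):

$ moses -f moses.ini -output-search-graph-slf slf-out -output-search-graph-hypergraph hg-out weights.txt < input.txt
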
// apply decision rule and output best translation(s)
if (m_outputCollector) {
ostringstream out;
@ -174,6 +214,11 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}

OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
@ -311,6 +356,8 @@ private:
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
bool m_outputSearchGraphSLF;
bool m_outputSearchGraphHypergraph;
std::ofstream *m_alignmentStream;


@ -367,6 +414,63 @@ static void ShowWeights()

}

size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< i
<< "=" << values[i] << endl;
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< "=" << values[0] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
assert(false);
return 0;
}
}

void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);

const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
}

}

} //namespace

/** main function of the command line version of the decoder **/
@ -391,20 +495,20 @@ int main(int argc, char** argv)

// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
Parameter* params = new Parameter();
if (!params->LoadParam(argc,argv)) {
Parameter params;
if (!params.LoadParam(argc,argv)) {
exit(1);
}


// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) {
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}

// setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) {
if (params.isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
@ -430,6 +534,14 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
OutputFeatureWeightsForHypergraph(*weightsOut);
weightsOut->flush();
weightsOut->close();
delete weightsOut;
}


// initialize output streams
// note: we can't just write to STDOUT or files
@ -533,7 +645,9 @@ int main(int argc, char** argv)
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
unknownsCollector.get() );
unknownsCollector.get(),
staticData.GetOutputSearchGraphSLF(),
staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
@ -551,6 +665,8 @@ int main(int argc, char** argv)
pool.Stop(true); //flush remaining jobs
#endif

delete ioWrapper;

} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;

@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
m_emptyAlignmentInfo = Add(pairs);
}

AlignmentInfoCollection::~AlignmentInfoCollection()
{}

const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
{
return *m_emptyAlignmentInfo;

@ -55,6 +55,7 @@ class AlignmentInfoCollection

//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();

static AlignmentInfoCollection s_instance;

@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;

if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs

@ -36,8 +36,9 @@ using namespace std;
namespace Moses
{

LanguageModelSingleFactor::~LanguageModelSingleFactor() {}

LanguageModelSingleFactor::~LanguageModelSingleFactor()
{
}

struct PointerState : public FFState {
const void* lmstate;
@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
m_beginSentenceState = new PointerState(NULL);
}

LanguageModelPointerState::~LanguageModelPointerState() {}
LanguageModelPointerState::~LanguageModelPointerState()
{
delete m_nullContextState;
delete m_beginSentenceState;
}

const FFState *LanguageModelPointerState::GetNullContextState() const
{

@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "Manager.h"
|
||||
#include "TypeDef.h"
|
||||
#include "Util.h"
|
||||
@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "rule.pb.h"
|
||||
#endif
|
||||
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
|
||||
:m_lineNumber(lineNumber)
|
||||
,m_system(system)
|
||||
:m_system(system)
|
||||
,m_transOptColl(source.CreateTranslationOptionCollection(system))
|
||||
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
|
||||
,interrupted_flag(0)
|
||||
,m_hypoId(0)
|
||||
,m_lineNumber(lineNumber)
|
||||
,m_source(source)
|
||||
{
|
||||
m_system->InitializeBeforeSentenceProcessing(source);
|
||||
@ -628,6 +632,420 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
|
||||
|
||||
}
|
||||
|
||||
void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
outputSearchGraphStream.setf(std::ios::fixed);
|
||||
outputSearchGraphStream.precision(6);
|
||||
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
||||
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
|
||||
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
|
||||
size_t featureIndex = 1;
|
||||
for (size_t i = 0; i < sff.size(); ++i) {
|
||||
featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
|
||||
}
|
||||
for (size_t i = 0; i < slf.size(); ++i) {
|
||||
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "I" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "g")
|
||||
{
|
||||
featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
|
||||
}
|
||||
}
|
||||
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
|
||||
for( size_t i=0; i<pds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
|
||||
}
|
||||
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
|
||||
for( size_t i=0; i<gds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
outputSearchGraphStream.setf(std::ios::fixed);
|
||||
outputSearchGraphStream.precision(6);
|
||||
|
||||
// outputSearchGraphStream << endl;
|
||||
// outputSearchGraphStream << (*hypo) << endl;
|
||||
// const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
|
||||
// outputSearchGraphStream << scoreCollection << endl;
|
||||
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
||||
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
|
||||
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
|
||||
size_t featureIndex = 1;
|
||||
for (size_t i = 0; i < sff.size(); ++i) {
|
||||
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
|
||||
}
|
||||
for (size_t i = 0; i < slf.size(); ++i) {
|
||||
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "I" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "g")
|
||||
{
|
||||
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
|
||||
}
|
||||
}
|
||||
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
|
||||
for( size_t i=0; i<pds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
|
||||
}
|
||||
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
|
||||
for( size_t i=0; i<gds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
outputSearchGraphStream.setf(std::ios::fixed);
|
||||
outputSearchGraphStream.precision(6);
|
||||
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
||||
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
|
||||
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
|
||||
size_t featureIndex = 1;
|
||||
for (size_t i = 0; i < sff.size(); ++i) {
|
||||
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
|
||||
}
|
||||
for (size_t i = 0; i < slf.size(); ++i) {
|
||||
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "I" &&
|
||||
slf[i]->GetScoreProducerWeightShortName() != "g")
|
||||
{
|
||||
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
|
||||
}
|
||||
}
|
||||
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
|
||||
for( size_t i=0; i<pds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
|
||||
}
|
||||
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
|
||||
for( size_t i=0; i<gds.size(); i++ ) {
|
||||
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
size_t numScoreComps = ff->GetNumScoreComponents();
|
||||
if (numScoreComps != ScoreProducer::unlimited) {
|
||||
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
|
||||
for (size_t i = 0; i < numScoreComps; ++i) {
|
||||
outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
|
||||
<< " " << ff->GetScoreProducerWeightShortName()
|
||||
<< " " << (i+1) << " of " << numScoreComps << endl
|
||||
<< "x" << (index+i) << "scale=" << values[i] << endl;
|
||||
}
|
||||
return index+numScoreComps;
|
||||
} else {
|
||||
cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
|
||||
assert(false);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
|
||||
// { const FeatureFunction* sp = ff;
|
||||
// const FVector& m_scores = scoreCollection.GetScoresVector();
|
||||
// FVector& scores = const_cast<FVector&>(m_scores);
|
||||
// std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
|
||||
// // std::cout << "prefix==" << prefix << endl;
|
||||
// // cout << "m_scores==" << m_scores << endl;
|
||||
// // cout << "m_scores.size()==" << m_scores.size() << endl;
|
||||
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
|
||||
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
|
||||
|
||||
|
||||
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
|
||||
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
|
||||
// // }
|
||||
// for(int i=0, n=v.size(); i<n; i+=1) {
|
||||
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
|
||||
|
||||
// }
|
||||
// }
|
||||
|
||||
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
|
||||
// outputSearchGraphStream << featureValues << endl;
|
||||
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
|
||||
|
||||
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
|
||||
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
|
||||
// if (numScoreComps != ScoreProducer::unlimited) {
|
||||
// vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
|
||||
for (size_t i = 0; i < numScoreComps; ++i) {
|
||||
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
|
||||
}
|
||||
return index+numScoreComps;
|
||||
// } else {
|
||||
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
|
||||
// assert(false);
|
||||
// return 0;
|
||||
// }
|
||||
}
|
||||
|
||||
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
|
||||
ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
|
||||
const Hypothesis *prevHypo = hypo->GetPrevHypo();
|
||||
if (prevHypo) {
|
||||
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
|
||||
}
|
||||
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
|
||||
size_t numScoreComps = featureValues.size();
|
||||
|
||||
if (numScoreComps > 1) {
|
||||
for (size_t i = 0; i < numScoreComps; ++i) {
|
||||
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " ";
|
||||
}
|
||||
} else {
|
||||
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " ";
|
||||
}
|
||||
|
||||
return index+numScoreComps;
|
||||
}
|
||||
|
||||
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
|
||||
void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
|
||||
{
|
||||
vector<SearchGraphNode> searchGraph;
|
||||
GetSearchGraph(searchGraph);
|
||||
|
||||
map<int,int> mosesIDToHypergraphID;
|
||||
// map<int,int> hypergraphIDToMosesID;
|
||||
set<int> terminalNodes;
|
||||
multimap<int,int> hypergraphIDToArcs;
|
||||
|
||||
long numNodes = 0;
|
||||
long endNode = 0;
|
||||
{
|
||||
long hypergraphHypothesisID = 0;
|
||||
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
|
||||
|
||||
// Get an id number for the previous hypothesis
|
||||
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
|
||||
if (prevHypo!=NULL) {
|
||||
int mosesPrevHypothesisID = prevHypo->GetId();
|
||||
if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
|
||||
mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
|
||||
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
|
||||
hypergraphHypothesisID += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Get an id number for this hypothesis
|
||||
int mosesHypothesisID;
|
||||
if (searchGraph[arcNumber].recombinationHypo) {
|
||||
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
|
||||
} else {
|
||||
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
|
||||
}
|
||||
|
||||
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
|
||||
|
||||
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
|
||||
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
|
||||
|
||||
bool terminalNode = (searchGraph[arcNumber].forward == -1);
|
||||
if (terminalNode) {
|
||||
// Final arc to end node, representing the end of the sentence </s>
|
||||
terminalNodes.insert(hypergraphHypothesisID);
|
||||
}
|
||||
|
||||
hypergraphHypothesisID += 1;
|
||||
}
|
||||
|
||||
// Record that this arc ends at this node
|
||||
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
|
||||
|
||||
}
|
||||
|
||||
// Unique end node
|
||||
endNode = hypergraphHypothesisID;
|
||||
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
|
||||
numNodes = endNode + 1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
long numArcs = searchGraph.size() + terminalNodes.size();
|
||||
|
||||
// Print number of nodes and arcs
|
||||
outputSearchGraphStream << numNodes << " " << numArcs << endl;
|
||||
|
||||
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
|
||||
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
|
||||
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
|
||||
if (count > 0) {
|
||||
outputSearchGraphStream << count << endl;
|
||||
|
||||
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
|
||||
hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
|
||||
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
|
||||
int lineNumber = (*it).second;
|
||||
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
|
||||
int mosesHypothesisID;// = thisHypo->GetId();
|
||||
if (searchGraph[lineNumber].recombinationHypo) {
|
||||
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
|
||||
} else {
|
||||
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
|
||||
}
|
||||
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
|
||||
UTIL_THROW_IF(
|
||||
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
|
||||
util::Exception,
|
||||
"Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
|
||||
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
|
||||
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
|
||||
". There are " << numNodes << " nodes in the search lattice."
|
||||
);
|
||||
|
||||
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
|
||||
if (prevHypo==NULL) {
|
||||
outputSearchGraphStream << "<s> ||| " << endl;
|
||||
} else {
|
||||
int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
|
||||
|
||||
UTIL_THROW_IF(
|
||||
(startNode >= hypergraphHypothesisID),
|
||||
util::Exception,
|
||||
"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
|
||||
"The nodes must be output in topological order. The code attempted to violate this restriction."
|
||||
);
|
||||
|
||||
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
|
||||
int targetWordCount = targetPhrase.GetSize();
|
||||
|
||||
outputSearchGraphStream << "[" << startNode << "]";
|
||||
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
|
||||
outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
|
||||
}
|
||||
outputSearchGraphStream << " ||| ";
|
||||
OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
|
||||
outputSearchGraphStream << endl;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Print node and arc(s) for end of sentence </s>
|
||||
outputSearchGraphStream << terminalNodes.size() << endl;
|
||||
for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
|
||||
outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**! Output search graph in HTK standard lattice format (SLF) */
void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
{

  vector<SearchGraphNode> searchGraph;
  GetSearchGraph(searchGraph);

  long numArcs = 0;
  long numNodes = 0;

  map<int,int> nodes;
  set<int> terminalNodes;

  // Unique start node
  nodes[0] = 0;

  for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {

    int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
    numArcs += targetWordCount;

    int hypothesisID = searchGraph[arcNumber].hypo->GetId();
    if (nodes.count(hypothesisID) == 0) {

      numNodes += targetWordCount;
      nodes[hypothesisID] = numNodes;
      //numNodes += 1;

      bool terminalNode = (searchGraph[arcNumber].forward == -1);
      if (terminalNode) {
        numArcs += 1;
      }
    }

  }
  numNodes += 1;

  // Unique end node
  nodes[numNodes] = numNodes;

  outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
  outputSearchGraphStream << "VERSION=1.1" << endl;
  outputSearchGraphStream << "base=2.71828182845905" << endl;
  outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
  outputSearchGraphStream << "LINKS=" << numArcs << endl;

  OutputFeatureWeightsForSLF(outputSearchGraphStream);

  for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
    const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
    const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
    if (prevHypo) {

      int startNode = nodes[prevHypo->GetId()];
      int endNode = nodes[thisHypo->GetId()];
      bool terminalNode = (searchGraph[lineNumber].forward == -1);
      const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
      int targetWordCount = targetPhrase.GetSize();

      for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
        int x = (targetWordCount-targetWordIndex);

        outputSearchGraphStream << "J=" << arcNumber;

        if (targetWordIndex==0) {
          outputSearchGraphStream << " S=" << startNode;
        } else {
          outputSearchGraphStream << " S=" << endNode - x;
        }

        outputSearchGraphStream << " E=" << endNode - (x-1)
                                << " W=" << targetPhrase.GetWord(targetWordIndex);

        OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);

        outputSearchGraphStream << endl;

        arcNumber += 1;
      }

      if (terminalNode && terminalNodes.count(endNode) == 0) {
        terminalNodes.insert(endNode);
        outputSearchGraphStream << "J=" << arcNumber
                                << " S=" << endNode
                                << " E=" << numNodes
                                << endl;
        arcNumber += 1;
      }
    }
  }

}

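The S/E arithmetic above expands an n-word phrase ending at endNode into n single-word SLF links threaded through the intermediate nodes endNode-n+1 .. endNode-1, so each link carries exactly one word. A small self-contained sketch with illustrative values (not Moses code) that prints the links for a 3-word phrase:

#include <iostream>

int main() {
  int startNode = 0, endNode = 5, targetWordCount = 3;
  for (int targetWordIndex = 0; targetWordIndex < targetWordCount; ++targetWordIndex) {
    int x = targetWordCount - targetWordIndex; // words remaining, incl. this one
    int s = (targetWordIndex == 0) ? startNode : endNode - x;
    int e = endNode - (x - 1);
    std::cout << "S=" << s << " E=" << e << "\n";
  }
  // prints: S=0 E=3, then S=3 E=4, then S=4 E=5
  return 0;
}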
void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
                      const SearchGraphNode& searchNode)
{
@ -93,6 +93,19 @@ class Manager
  Manager(Manager const&);
  void operator=(Manager const&);
  const TranslationSystem* m_system;
private:

  // Helper functions to output search graph in HTK standard lattice format
  void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
  size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
  void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
  size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;

  // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
  void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
  size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;


protected:
  // data
  // InputType const& m_source; /**< source sentence to be translated */
@ -103,6 +116,7 @@ protected:
  size_t interrupted_flag;
  std::auto_ptr<SentenceStats> m_sentenceStats;
  int m_hypoId; //used to number the hypos as they are created.
  size_t m_lineNumber;

  void GetConnectedGraph(
    std::map< int, bool >* pConnected,
@ -113,7 +127,6 @@ protected:


public:
  size_t m_lineNumber;
  InputType const& m_source; /**< source sentence to be translated */
  Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
  ~Manager();
@ -137,6 +150,8 @@ public:
#endif

  void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
  void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
  void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
  void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
  const InputType& GetSource() const {
    return m_source;
@ -130,6 +130,8 @@ Parameter::Parameter()
  AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
  AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
  AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
  AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
  AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
  AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
  AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
@ -177,6 +179,7 @@ Parameter::Parameter()
  AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
  AddParam("minphr-memory", "Load phrase table in minphr format into memory");

  AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are taken from the phrase table if any. Default is false");
  AddParam("include-segmentation-in-n-best", "Include phrasal segmentation in the n-best list. Default is false");
  AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false");
  AddParam("alignment-output-file", "Print output word alignments into given file");
@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
    }
  }

  if(m_parameter->GetParam("sort-word-alignment").size()) {
    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
  }

  // factor delimiter
  if (m_parameter->GetParam("factor-delimiter").size() > 0) {
    m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
  SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );

  //word-to-word alignment
  // alignments
  SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
  if (m_PrintAlignmentInfo) {
    m_needAlignmentInfo = true;
  }

  if(m_parameter->GetParam("sort-word-alignment").size()) {
    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
  }

  SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
  if (m_PrintAlignmentInfoNbest) {
    m_needAlignmentInfo = true;
@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter)
    }
    m_outputSearchGraph = true;
    m_outputSearchGraphExtended = true;
  } else
  } else {
    m_outputSearchGraph = false;
  }
  if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
    m_outputSearchGraphSLF = true;
  } else {
    m_outputSearchGraphSLF = false;
  }
  if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
    m_outputSearchGraphHypergraph = true;
  } else {
    m_outputSearchGraphHypergraph = false;
  }
#ifdef HAVE_PROTOBUF
  if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
    if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
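Both of the new output flags above follow the same presence-check idiom: the boolean is set purely from whether the parameter occurs at least once in the configuration, with no value inspection. A distilled sketch of that idiom under assumed types (Params here stands in for the Moses Parameter class, which is not reproduced):

#include <map>
#include <string>
#include <vector>

typedef std::map<std::string, std::vector<std::string> > Params;

// A flag-style option is "on" when the parameter was given at all,
// regardless of any value attached to it.
bool FlagIsSet(const Params &params, const std::string &name) {
  Params::const_iterator it = params.find(name);
  return it != params.end() && !it->second.empty();
}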
@ -171,6 +171,7 @@ protected:
  bool m_reportAllFactorsNBest;
  std::string m_detailedTranslationReportingFilePath;
  bool m_onlyDistinctNBest;
  bool m_PrintAlignmentInfo;
  bool m_needAlignmentInfo;
  bool m_PrintAlignmentInfoNbest;

@ -216,6 +217,8 @@ protected:
  bool m_outputWordGraph; //! whether to output word graph
  bool m_outputSearchGraph; //! whether to output search graph
  bool m_outputSearchGraphExtended; //! ... in extended format
  bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
  bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph format
#ifdef HAVE_PROTOBUF
  bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
@ -458,7 +461,7 @@ public:
    return m_nBestFilePath;
  }
  bool IsNBestEnabled() const {
    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
#ifdef HAVE_PROTOBUF
      || m_outputSearchGraphPB
#endif
@ -631,6 +634,12 @@ public:
  bool GetOutputSearchGraphExtended() const {
    return m_outputSearchGraphExtended;
  }
  bool GetOutputSearchGraphSLF() const {
    return m_outputSearchGraphSLF;
  }
  bool GetOutputSearchGraphHypergraph() const {
    return m_outputSearchGraphHypergraph;
  }
#ifdef HAVE_PROTOBUF
  bool GetOutputSearchGraphPB() const {
    return m_outputSearchGraphPB;
@ -722,6 +731,9 @@ public:
  const std::string &GetAlignmentOutputFile() const {
    return m_alignmentOutputFile;
  }
  bool PrintAlignmentInfo() const {
    return m_PrintAlignmentInfo;
  }
  bool PrintAlignmentInfoInNbest() const {
    return m_PrintAlignmentInfoNbest;
  }
@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
  if (kneserNeyFlag) {
    float D = kneserNey_D3;
    if (countEF < 2) D = kneserNey_D1;
    if (countEF < 3) D = kneserNey_D2;
    else if (countEF < 3) D = kneserNey_D2;
    if (D > countEF) D = countEF - 0.01; // sanity constraint

    float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
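The change above turns the second test into an "else if"; without it, any phrase pair with countEF < 2 would first receive discount D1 and then immediately have it overwritten by D2, since every count below 2 is also below 3. A minimal sketch of the corrected selection (function name and signature assumed for illustration, not the actual Moses interface):

// Pick the Kneser-Ney discount for a given co-occurrence count:
// D1 for singletons, D2 for count 2, D3 otherwise.
float KneserNeyDiscount(float countEF, float D1, float D2, float D3) {
  float D = D3;
  if (countEF < 2) D = D1;
  else if (countEF < 3) D = D2;
  if (D > countEF) D = countEF - 0.01f; // sanity constraint
  return D;
}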
@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
  if (m_options.isOrientationFlag())
    outextractstrOrientation << orientationInfo;

  if (m_options.isIncludeSentenceIdFlag()) {
    outextractstr << " ||| " << sentence.sentenceID;
  }

  if (m_options.getInstanceWeightsFile().length()) {
    if (m_options.isTranslationFlag()) {
      outextractstr << " ||| " << sentence.weightString;
@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
    }
  }

  if (m_options.isIncludeSentenceIdFlag()) {
    outextractstr << " ||| " << sentence.sentenceID;
  }

  if (m_options.isTranslationFlag()) outextractstr << "\n";
  if (m_options.isTranslationFlag()) outextractstrInv << "\n";
@ -13,10 +13,10 @@ chomp(@OUT);
while(<SRC>) {
  chomp;
  if (/^<srcset/) {
    s/<srcset/<tstset trglang="$language"/;
    s/<srcset/<tstset trglang="$language"/i;
  }
  elsif (/^<\/srcset/) {
    s/<\/srcset/<\/tstset/;
    s/<\/srcset/<\/tstset/i;
  }
  elsif (/^<doc/i) {
    s/ *sysid="[^\"]+"//;
@ -26,10 +26,10 @@ while(<SRC>) {
  my $line = shift(@OUT);
  $line = "" if $line =~ /NO BEST TRANSLATION/;
  if (/<\/seg>/) {
    s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
    s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
  }
  else {
    s/(<seg[^>]+> *)[^<]*/$1$line/;
    s/(<seg[^>]+> *)[^<]*/$1$line/i;
  }
}
print $_."\n";
@ -179,10 +179,13 @@ sub apply {
    $word =~ s/\|.+//g; # just first factor
    my $lc = lc($word);

    print STDERR "considering $word ($lc)...\n" if $VERBOSE;
    # don't split frequent words
    if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
    if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
        $lc !~ /[a-zA-Z]/) { # has to have at least one letter
      print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
      print $factored_word;
      print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
      next;
    }

@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
sub extract_sgml_tag_attribute
{
  my ($name, $data) = @_;
  ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
  ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}

#################################
@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");


my ($SRC,$INFILE);
my ($SRC,$INFILE,$UNBUFFERED);
die("detruecase.perl < in > out")
  unless &GetOptions('headline=s' => \$SRC,
                     'in=s' => \$INFILE);
                     'in=s' => \$INFILE,
                     'b|unbuffered' => \$UNBUFFERED);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
@ -4,7 +4,7 @@
use strict;
use Getopt::Long "GetOptions";

my ($SRC,$INFILE,$RECASE_MODEL);
my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
my $MOSES = "moses";
my $LANGUAGE = "en"; # English by default;
die("recase.perl --in file --model ini-file > out")
@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
                   'headline=s' => \$SRC,
                   'lang=s' => \$LANGUAGE,
                   'moses=s' => \$MOSES,
                   'model=s' => \$RECASE_MODEL)
                   'model=s' => \$RECASE_MODEL,
                   'b|unbuffered' => \$UNBUFFERED)
  && defined($INFILE)
  && defined($RECASE_MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

my %treated_languages = map { ($_,1) } qw/en cs/;
die "I don't know any rules for $LANGUAGE. Use 'en' as the default."
@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# apply switches
my $MODEL;
die("truecase.perl --model truecaser < in > out")
  unless &GetOptions('model=s' => \$MODEL);
my ($MODEL, $UNBUFFERED);
die("truecase.perl --model MODEL [-b] < in > out")
  unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
  && defined($MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
@ -171,7 +171,7 @@ if ($TIMING)

# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array cotaining a batch of tokenized texts for the input array
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
  my(@text_list) = @_;
@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
  $opn = $l1input;
} elsif (-e $l1input.".gz") {
  $opn = "zcat $l1input.gz |";
  $opn = "gunzip -c $l1input.gz |";
} else {
  die "Error: $l1input does not exist";
}
@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
  $opn = $l2input;
} elsif (-e $l2input.".gz") {
  $opn = "zcat $l2input.gz |";
  $opn = "gunzip -c $l2input.gz |";
} else {
  die "Error: $l2input does not exist";
}
@ -160,3 +160,4 @@ sub word_count {
  my @w = split(/ /,$line);
  return scalar @w;
}

@ -40,7 +40,8 @@ def printUsage():
def main():
    parser = optparse.OptionParser()
    parser.add_option("-c", "--min-non-initial-rule-count",
                      action="store", dest="minCount", type="int", default="1",
                      action="store", dest="minCount",
                      type="float", default="0.0",
                      help="prune non-initial rules where count is below N",
                      metavar="N")
    (options, args) = parser.parse_args()