Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Hieu Hoang 2013-03-15 20:38:42 +00:00
commit 18e8f12d5e
65 changed files with 2028 additions and 165 deletions

3
.gitmodules vendored
View File

@ -0,0 +1,3 @@
[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
path = contrib/arrow-pipelines/python/libs/pypeline
url = git://github.com/ianj-als/pypeline.git

View File

@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user

2
NOTICE
View File

@ -1,3 +1,5 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).

View File

@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================
To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
$ git submodule init
$ git submodule update
This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
$ cd libs/pypeline
$ python setup.py install
Alternatively, you can set an appropriate PYTHONPATH environment variable pointing to the Pypeline library.
This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
Three environment variables need to be set before the manager.py script can be run, they are:
- MOSES_HOME : The directory where Moses has been cloned, or installed,
- IRSTLM : The installation directory of your IRSTLM, and
- GIZA_HOME : The installation directory of GIZA++.
The manager.py script takes four positional command-line arguments:
- The source language code,
- The target language code,
- The source corpus file. This file *must* be cleaned prior to use, and
- The target corpus file. This file *must* be cleaned prior to use.
For example, run the manager.py script with:
$ python manager.py en lt cleantrain.en cleantrain.lt

@ -0,0 +1 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb

View File

@ -0,0 +1,192 @@
import logging
import os
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
cons_function_component, \
cons_wire, \
cons_split_wire, \
cons_unsplit_wire, \
cons_dictionary_wire
#
# Some logging please: timestamp, worker-thread name and level on every
# record; DEBUG and up for the whole pipeline run.
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")
# Build the pipeline components
def build_components(components, configuration, executor):
    """Import, configure and wrap every pipeline component.

    components maps a component id to the dotted module name providing
    configure()/initialise(); configuration is the global options dict.
    Returns (pipeline_components, pipeline_configuration): arrowized
    components keyed by id, plus the merged per-component configuration.

    NOTE(review): the executor parameter is unused here - the wrapper
    below only logs before delegating; confirm whether submitting work
    to the executor was intended.
    """
    pipeline_components = dict()
    pipeline_configuration = dict()

    for component_id, module_name in components.items():
        logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))

        module = __import__(module_name, fromlist = ['configure', 'initialise'])

        # Component builds its own configuration object
        config_func = getattr(module, 'configure')
        component_config = config_func(configuration)
        pipeline_configuration.update(component_config)

        # Now build the component
        init_func = getattr(module, 'initialise')
        component_function = init_func(component_config)

        # A wrapper for the component's function that logs each invocation
        # with its value and state before delegating.
        def get_component_function_wrapper(inner_function, comp_id, mod_name):
            def component_function_wrapper(a, s):
                logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
                            (comp_id, mod_name, a, s))
                return inner_function(a, s)
            return component_function_wrapper

        # Arrowize the component
        component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))

        # And store
        pipeline_components[component_id] = component

    return pipeline_components, pipeline_configuration
# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
    """Construct the full training pipeline and evaluate it over the given
    source/target corpus pair.

    src_lang/trg_lang are language codes; src_filename/trg_filename are
    pre-cleaned parallel corpus files (see the accompanying README).
    """
    # Global configuration
    # One day, this configuration shall be constructed from
    # command line options, or a properties file.
    configuration = {
        'moses_installation_dir': os.environ['MOSES_HOME'],
        'irstlm_installation_dir': os.environ['IRSTLM'],
        'giza_installation_dir': os.environ['GIZA_HOME'],
        'src_lang': src_lang,
        'src_tokenisation_dir': './tokenisation',
        'trg_lang': trg_lang,
        'trg_tokenisation_dir': './tokenisation',
        'segment_length_limit': 60,
        'irstlm_smoothing_method': 'improved-kneser-ney',
        'language_model_directory': './language-model',
        'translation_model_directory': './translation-model',
        'mert_working_directory': './mert',
        'evaluation_data_size': 100,
        'development_data_size': 100
    }

    # The modules to load
    # In the future, the components shall be specified in some kind
    # pipeline description file.
    component_modules = {
        'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
        'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
        'cleanup': 'training.components.cleanup.cleanup',
        'data_split': 'training.components.data_split.data_split',
        'irstlm_build': 'training.components.irstlm_build.irstlm_build',
        'model_training': 'training.components.model_training.model_training',
        'mert': 'training.components.mert.mert'
    }

    # The thread pool
    executor = ThreadPoolExecutor(max_workers = 3)

    # Phew, build the required components
    components, component_config = build_components(component_modules, configuration, executor)

    #
    # Wire up components
    # Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
    # declared then used, i.e., bind a component instance to a unique component
    # identifier, then wire component instances together by identifier.
    #

    #
    # Tokenisation of source and target...
    #
    # IRSTLM Build components: split the value, build the LM from the
    # tokenised target on the second branch, then merge both branches
    # back into one dictionary.
    irstlm_build_component = cons_split_wire() >> \
                             (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
                              components['irstlm_build']).second() >> \
                             cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
                                                             'trg_language_model_filename': b['compiled_lm_filename']})

    # The complete tokenisation component: tokenise both sides in
    # parallel, then build the target-side language model.
    tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
                             irstlm_build_component.second() >> \
                             cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
                                                             'trg_filename': b['tokenised_trg_filename'],
                                                             'trg_language_model_filename': b['trg_language_model_filename']})

    #
    # Cleanup and Data Splitting...
    #

    #
    # A function that clips off the last '.' delimited string
    # (e.g. corpus.train.en -> corpus.train)
    #
    def clip_last_bit(filename):
        bn = os.path.basename(filename)
        directory = os.path.dirname(filename)
        bits = bn.split(".")
        bits.pop()
        return os.path.join(directory, ".".join(bits))

    # Clean the corpus, then split it into train/eval/devel sets.
    cleanup_datasplit_component = components['cleanup'] >> \
                                  cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
                                                          'trg_filename': a['cleaned_trg_filename']}) >> \
                                  components['data_split'] >> \
                                  cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
                                                          'eval_src_filename': a['eval_src_filename'],
                                                          'eval_trg_filename': a['eval_trg_filename']})

    #
    # Translation model training
    #
    translation_model_component = cons_split_wire() >> \
                                  components['model_training'].first() >> \
                                  cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                                                  'development_data_filename': b['eval_src_filename']})

    #
    # The whole pipeline
    #
    # NOTE(review): trg_language_model_order/type are hard-coded (3 and 9)
    # here; presumably they should track the IRSTLM build settings - confirm.
    pipeline = tokenisation_component >> \
               cons_split_wire() >> \
               (cleanup_datasplit_component >> translation_model_component).first() >> \
               cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                               'development_data_filename': clip_last_bit(t['development_data_filename']),
                                               'trg_language_model_filename': b['trg_language_model_filename'],
                                               'trg_language_model_order': 3,
                                               'trg_language_model_type': 9}) >> \
               components['mert']

    #
    # The input to the pipeline
    #
    value = {'src_filename': src_filename,
             'trg_filename': trg_filename}

    #
    # Evaluate the pipeline
    #
    logger.info("Evaluating pipeline with input [%s]..." % value)
    new_value = eval_pipeline(executor, pipeline, value, component_config)

    #
    # Wait for all components to finish
    #
    executor.shutdown(True)
    logger.info("Pipeline evaluated to %s" % new_value)
if __name__ == '__main__':
    import sys
    # Positional arguments: src lang code, trg lang code, cleaned source
    # corpus, cleaned target corpus.
    args = sys.argv
    main(args[1], args[2], args[3], args[4])

View File

@ -0,0 +1,11 @@
import subprocess
def cat(filename, content):
    """Write each item of `content` to `filename`, one item per line.

    Items are stringified exactly as Python 2's `print >> fh, line` did.
    """
    # 'with' guarantees the handle is closed even when a write fails
    # (the original leaked the handle on error), and the explicit
    # write() also works under Python 3.
    with open(filename, "w") as fh:
        for line in content:
            fh.write("%s\n" % (line,))
def diff(filename1, filename2):
    """Assert that two files are identical.

    Raises subprocess.CalledProcessError when the files differ (diff
    exits non-zero). Returns the diff output (empty when identical)
    instead of silently discarding it as the original did.
    """
    return subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)

View File

@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Extract the cleanup component's settings from the global
    pipeline configuration."""
    return {'segment_length': args['segment_length_limit']}
def initialise(config):
    """Build the corpus-cleanup component.

    Returns _filter_main(value, config): reads value['src_filename'] /
    value['trg_filename'] in parallel and writes *.clean siblings
    containing only the sentence pairs where BOTH sides have fewer than
    config['segment_length'] space-separated tokens. The `config`
    argument of initialise itself is unused; the limit is read at call
    time.
    """
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        # A line is "short" when it contains fewer than `limit` spaces,
        # i.e. roughly fewer than `limit` tokens.
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            return n < limit

        # write() preserves each kept line verbatim (incl. its newline)
        # and, unlike the original `print >>fh, l,` statements, is valid
        # under both Python 2 and 3 - matching the py3 variant of this
        # component.
        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                ofh1.write(l1)
                ofh2.write(l2)

    def _make_cleaned_filename(filename):
        # foo.xx -> foo.clean : replace the last extension.
        bits = filename.split(".")
        bits[-1] = "clean"
        return ".".join(bits)

    def _filter_main(value, config):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']
            print("Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename))

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")

            cleaned_src_filename = _make_cleaned_filename(input_src_filename)
            cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
            ofh1 = open(cleaned_src_filename, "w")
            ofh2 = open(cleaned_trg_filename, "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': cleaned_src_filename,
                    'cleaned_trg_filename': cleaned_trg_filename}
        finally:
            # Close whichever handles were successfully opened.
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _filter_main
if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        # End-to-end check: clean a tiny synthetic corpus and diff the
        # result against hand-built expected files.
        configuration = {'segment_length_limit': 20}

        # NOTE(review): mkstemp returns (fd, path) and the fd is never
        # closed; only element [1] (the path) is used below.
        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        # thelp.diff raises when actual and expected differ.
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])

    def _line(line_lengths):
        # Build synthetic sentences: n -> "tok0 tok1 ... tok(n-1)".
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        # Expected output: only the src-17/trg-20 pair survives. The
        # filter counts spaces, so a 20-token line has 19 spaces and
        # passes the "< 20" test.
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        # Best-effort removal of every fixture file.
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

View File

@ -0,0 +1,109 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Map the global pipeline configuration onto this component's
    private configuration dictionary."""
    segment_length = args['segment_length_limit']
    return {'segment_length': segment_length}
def initialise(config):
    """Create the cleanup component (Python 3 variant).

    Returns an arrowized component wrapping _filter_main: it keeps only
    the sentence pairs whose two sides both have fewer than
    config['segment_length'] space-separated tokens, writing them to the
    caller-supplied cleaned_* file paths.
    """
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        # A line is "short" when it contains fewer than `limit` spaces,
        # i.e. roughly fewer than `limit` tokens.
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        # end='' because the lines retain their original newlines.
        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print(l1, end='', file=ofh1)
                print(l2, end='', file=ofh2)

    # NOTE(review): parameter order here is (config, value), while the
    # sibling variant of this component uses (value, config) - confirm
    # which convention the pipeline runner expects.
    def _filter_main(config, value):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            ifh1 = open(value['src_filename'], "r")
            ifh2 = open(value['trg_filename'], "r")
            ofh1 = open(value['cleaned_src_filename'], "w")
            ofh2 = open(value['cleaned_trg_filename'], "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': value['cleaned_src_filename'],
                    'cleaned_trg_filename': value['cleaned_trg_filename']}
        finally:
            # Close whichever handles were successfully opened.
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return cons_function_component(_filter_main)
if __name__ == '__main__':
    import os
    import tempfile
    import training.components.shared.test as thelp

    def _test_main():
        # End-to-end check of the cleanup box over a synthetic corpus.
        configuration = {'segment_length_limit': 20}

        # NOTE(review): mkstemp returns (fd, path); the fd is never closed.
        src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_filename': src_filename[1] + ".clean",
            'cleaned_trg_filename': trg_filename[1] + ".clean",
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)

        run_pipeline(box, box_config, box_eval)
        # thelp.diff raises when actual and expected differ.
        thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
        thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])

    def _line(line_lengths):
        # n -> "tok0 tok1 ... tok(n-1)" for each requested length.
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        # Expected output: only the src-17/trg-20 pair survives the
        # 20-token limit (token count is approximated by space count).
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        # Best-effort removal of all fixture files.
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

View File

@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Pull the evaluation/development split sizes out of the global
    pipeline configuration."""
    return {
        'evaluate_size': args['evaluation_data_size'],
        'development_size': args['development_data_size'],
    }
def initialise(config):
    """Build the data-split component.

    Returns _splitter_main(value, config): reads the parallel corpus
    named by value['src_filename']/value['trg_filename'] and splits it
    sequentially into 'devel' (config['development_size'] pairs), 'eval'
    (config['evaluate_size'] pairs) and 'train' (everything left; size
    -1 means "copy to EOF"). Returns a dict of the six output paths.
    The `config` argument of initialise itself is unused.
    """
    def _copy(size, inp, ofh1, ofh2):
        # Copy up to `size` line pairs from `inp`; size == -1 never hits
        # zero, so it copies until the iterator is exhausted.
        try:
            while size != 0:
                # next(inp) works on both Python 2.6+ and 3; the original
                # inp.next() is Python 2 only.
                (l1, l2) = next(inp)
                ofh1.write(l1)
                ofh2.write(l2)
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        # Rearranges the dotted parts around the data-set tag, e.g.
        # a.b.xx -> a.xx.<data_set>.b
        # NOTE(review): assumes at least two '.'-separated parts; with a
        # single-extension path the directory component ends up inside
        # the new name - confirm expected input naming.
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()
        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)
        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2) = (None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']
            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                # Close each output pair before moving on: the original
                # leaked every pair of output handles except the last.
                ofh1 = open(output_src_filename, "w")
                try:
                    ofh2 = open(output_trg_filename, "w")
                    try:
                        _copy(size, inp, ofh1, ofh2)
                    finally:
                        ofh2.close()
                finally:
                    ofh1.close()
                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            # Close the input handles that were successfully opened.
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)

    return _splitter_main
if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        # Split a 50-line synthetic corpus into 13 development, 7
        # evaluation and 30 training pairs, then diff against expectations.
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        # NOTE(review): mkstemp returns (fd, path); the fd is never
        # closed. Also, _make_split_filename() rearranges the dotted
        # parts of the path, so with "/tmp/tmpXXXX.src" inputs the split
        # files land under names derived from the whole path - verify
        # this harness actually runs as intended.
        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        # thelp.diff raises when a produced split differs from expected.
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)

    def _line(line_lengths):
        # n -> "tok0 tok1 ... tok(n-1)" for each requested length.
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        # Expected split: lines 0-12 devel, 13-19 eval, 20-49 train.
        thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))

    def _cleanup_files(box_eval):
        # Best-effort removal of every fixture file.
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

View File

@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the IRSTLM-related settings from the global pipeline
    configuration."""
    wanted = {
        'irstlm_install_directory': 'irstlm_installation_dir',
        'smoothing_method': 'irstlm_smoothing_method',
        'lm_directory': 'language_model_directory',
    }
    return dict((key, args[source]) for key, source in wanted.items())
def initialise(config):
    """Build the IRSTLM language-model component.

    Returns process(a, s): `a` supplies 'input_filename' (the tokenised
    target corpus); `s` supplies the configured install/LM directories
    and smoothing method. The component adds sentence start/end markers,
    builds an LM with build-lm.sh and compiles it with compile-lm,
    returning the three produced file paths.
    """
    def process(a, s):
        # Create the LM directory if we need to
        if os.path.exists(s['lm_directory']) is False:
            os.makedirs(s['lm_directory'])

        # The filename of the file to chew through
        start_end_input_filename = a['input_filename']
        if os.path.exists(start_end_input_filename) is False:
            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)

        # Derive output names by replacing the third dotted component,
        # e.g. corpus.fr-en.tok.en -> corpus.fr-en.sb.en.
        # NOTE(review): assumes the basename has at least three dotted
        # parts - confirm against the tokeniser's output naming.
        filename_bits = os.path.basename(start_end_input_filename).split(".")
        filename_bits[2] = "sb"
        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
        # Derive the output file name of the LM build
        filename_bits[2] = "lm"
        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
        # Derive the compiled LM file name
        filename_bits[2] = "arpa"
        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # First thing to do is add start and end markers. Close both
        # handles when done - the original leaked them.
        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
        infile = open(start_end_input_filename, 'r')
        try:
            outfile = open(start_end_output_filename, 'w')
            try:
                print("IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline))
                return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
            finally:
                outfile.close()
        finally:
            infile.close()
        # check_call raises CalledProcessError on failure, so this is a
        # belt-and-braces check. The original formatted the message with
        # an unparenthesised tuple, which raised TypeError instead of
        # the intended Exception.
        if return_code:
            raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
                            (start_end_input_filename, start_end_output_filename, return_code))

        # Next build the language model in a throwaway temp directory.
        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
        try:
            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
                                "-i", start_end_output_filename,
                                "-t", tmp_dir,
                                "-p",
                                "-s", s['smoothing_method'],
                                "-o", lm_filename]
            print("IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline))
            return_code = subprocess.check_call(build_lm_cmdline)
            if return_code:
                raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)

        # Compile the LM; build-lm.sh gzips its output, hence the suffix.
        lm_filename = lm_filename + ".gz"
        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
                              "--text", "yes",
                              lm_filename,
                              compiled_lm_filename]
        print("IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline))
        return_code = subprocess.check_call(compile_lm_cmdline)
        if return_code:
            raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)

        output = {'add_start_end_filename': start_end_output_filename,
                  'lm_filename': lm_filename,
                  'compiled_lm_filename': compiled_lm_filename}
        print("IRSTLM Build: Output = %s" % output)
        return output

    return process
if __name__ == '__main__':
    from pypeline.helpers.helpers import eval_pipeline

    # Smoke test wired to the author's environment (IRSTLM must be set
    # and the hard-coded corpus path must exist).
    lm_dir = os.environ["PWD"]
    # configure() reads 'irstlm_installation_dir'; the original passed
    # 'irstlm_root', which raised KeyError before anything ran.
    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
                     'irstlm_smoothing_method': 'improved-kneser-ney',
                     'language_model_directory': lm_dir}
    component_config = configure(configuration)
    component = initialise(component_config)

    value = eval_pipeline(component,
                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
                          component_config)
    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
    print("Target: %s" % target)
    if value != target:
        raise Exception("Massive fail!")

View File

@ -0,0 +1,83 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Gather the MERT component's settings from the global pipeline
    configuration."""
    keys = (('src_lang', 'src_lang'),
            ('trg_lang', 'trg_lang'),
            ('moses_installation_dir', 'moses_installation_dir'),
            ('mert_working_dir', 'mert_working_directory'))
    return dict((dst, args[src]) for dst, src in keys)
def initialise(config):
    """Build the MERT (tuning) component.

    Returns process(a, s): `a` must provide the development-data stem,
    the target LM filename/order/type and the input moses.ini. The
    component rewrites the [lmodel-file] section of the ini, runs
    mert-moses.pl in config['mert_working_dir'] and returns
    {'moses_ini_file': <tuned ini>}.
    """
    def process(a, s):
        infilename = os.path.abspath(a['development_data_filename'])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        orig_moses_ini = os.path.abspath(a['moses_ini_file'])

        if not os.path.exists(orig_moses_ini):
            # Parenthesised raise: same behaviour as the original
            # Python 2-only `raise Exception, "..."` statement.
            raise Exception("Error: Input moses.ini does not exist")

        workdir = os.path.abspath(config['mert_working_dir'])
        # Simply call the training perl script;
        # remove the workdir if it is already there.
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        # Local vars consumed by the `cmd % locals()` templates below.
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')

        # Rewrite the [lmodel-file] section of the moses ini with the
        # supplied language model.
        # NOTE(review): both commands below are built by string
        # interpolation and run through the shell - paths containing
        # metacharacters will break (or worse); consider list-form argv.
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % locals()
        os.system(cmd)

        # The MERT command itself; stderr goes to the log file.
        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % locals()
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        # NOTE(review): the exit status is ignored; failure is detected
        # only via the missing moses.ini below.
        pipe.wait()

        # Check that MERT produced a tuned moses.ini.
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception('Failed MERT')

        return {'moses_ini_file':new_mosesini}

    return process
if __name__ == '__main__':
    def __test():
        # Smoke test wired to a local checkout; paths are relative to
        # this component's directory.
        # configure() reads 'mert_working_directory' - the original
        # passed 'mert_working_dir', which raised KeyError before
        # anything ran.
        configuration = {'src_lang':'en',
                         'trg_lang':'lt',
                         'moses_installation_dir':os.path.abspath('../../../../'),
                         'mert_working_directory':'../../../../../tuning'}
        values = {'development_data_filename':'../../../../../corpus/tune',
                  'moses_ini_file':'../../../../../model/model/moses.ini',
                  'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
                  'trg_language_model_type':9,
                  'trg_language_model_order':4}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Pass the *configured* dictionary, not the raw one, so the
        # component sees the key names configure() produces.
        box = initialise(box_config)
        print(run_pipeline(box, values, None))
    #do some test
    __test()

View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Collect the translation-model training settings from the global
    pipeline configuration."""
    mapping = {'src_lang': 'src_lang',
               'trg_lang': 'trg_lang',
               'moses_installation_dir': 'moses_installation_dir',
               'external_bin_dir': 'giza_installation_dir',
               'model_directory': 'translation_model_directory'}
    return dict((dst, args[src]) for (dst, src) in mapping.items())
def initialise(config):
    """Build the translation-model training component.

    Returns process(a, s): takes a['training_data_filename'] (corpus
    stem), runs train-model.perl into config['model_directory'] and
    returns {'moses_ini_file': <path>}; raises Exception when no
    moses.ini appears.
    """
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        # Simply call the training perl script;
        # remove the workdir if it is already there.
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        # Local vars consumed by the `cmd % locals()` template below.
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        # Create a dummy LM file - training only needs the path to
        # exist. 'with' + write() replaces the original `print >> f`
        # (which is Python 2 only) and cannot leak the handle.
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        with open(dummy_lmfile, 'w') as f:
            f.write("dummy lm file\n")
        logfile = workdir + os.sep + 'log'

        # The training command; stderr goes to the log file.
        # NOTE(review): built by interpolation and run with shell=True -
        # paths with shell metacharacters will break; exit status is
        # ignored and failure is detected only via the missing ini below.
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        # Check that training produced a moses.ini.
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception('Failed training model')

        return {'moses_ini_file':mosesini}

    return process
if __name__ == '__main__':
    def __test():
        # Smoke test wired to a local environment; MOSES_HOME and
        # GIZA_HOME must point at real installations.
        configuration = {'src_lang':'en',
                         'trg_lang':'lt',
                         'moses_installation_dir':os.environ['MOSES_HOME'],
                         'giza_installation_dir':os.environ['GIZA_HOME'],
                         'translation_model_directory':'model-dir'}
        # NOTE(review): hard-coded absolute corpus path - this harness
        # only runs on the original author's machine.
        values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)
    #do some test
    __test()

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Copy the source-tokeniser settings from the global pipeline
    configuration."""
    keys = ('src_lang', 'src_tokenisation_dir', 'moses_installation_dir')
    return dict((k, args[k]) for k in keys)
def initialise(config):
    """Create the source-side tokenisation processor.

    Returns process(a, s): tokenises a['src_filename'] with the Moses
    tokeniser and returns the output path under
    'tokenised_src_filename'.
    """
    def process(a, s):
        # Delegate the actual work to the Moses tokeniser wrapper.
        source_file = a['src_filename']
        tokenised = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            source_file,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename':tokenised}
    return process
if __name__ == '__main__':
    def __test():
        # Smoke test against a local Moses checkout.
        configuration = {'src_lang':'de',
                         'src_tokenisation_dir':'tmptok',
                         'moses_installation_dir':os.path.abspath('../../../../')}
        values = {'src_filename':'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Use the configured dictionary (the original passed the raw
        # `configuration`, which only worked because configure() happens
        # to preserve these key names).
        box = initialise(box_config)
        print(run_pipeline(box, values, None))
    #do some test
    __test()

View File

@ -0,0 +1,3 @@
asdfweoih
awfwoeijf awefo
what's this

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
import sys, os, subprocess
class Tokenizer:
    """Thin wrapper around the Moses perl tokeniser script."""

    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        """Tokenise infilename into workdir/<basename>.tok and return
        the output path; creates workdir if necessary."""
        # print(...) is valid under both Python 2 and 3, unlike the
        # original print statement.
        print("Tokenizing [%s] in working directory [%s]..." % (infilename, workdir))
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        """lang: language code passed to tokenizer.perl; mosesdir: root
        of a Moses installation. Raises Exception when the tokeniser
        script's directory is missing."""
        self.arrows = None
        self.lang = lang
        #check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            # Parenthesised raise: same behaviour as the Python 2-only
            # `raise Exception, "..."` statement.
            raise Exception("Perl tokenizer does not exists")

    def file_tokenise(self, infilename, outfilename):
        """Run tokenizer.perl over one file (blocking)."""
        # NOTE(review): shell interpolation of paths and an ignored exit
        # status - consider subprocess list form plus a result check.
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()
if __name__ == '__main__':
    # No standalone self-test yet; the class is exercised through the
    # src/trg tokeniser components.
    #do some test
    pass

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Copy the target-tokeniser settings from the global pipeline
    configuration."""
    wanted = ['trg_lang', 'trg_tokenisation_dir', 'moses_installation_dir']
    return {key: args[key] for key in wanted}
def initialise(config):
    """Create the target-side tokenisation processor.

    Returns process(a, s): tokenises a['trg_filename'] with the Moses
    tokeniser and returns the output path under
    'tokenised_trg_filename'.
    """
    def process(a, s):
        # Delegate the actual work to the Moses tokeniser wrapper.
        target_file = a['trg_filename']
        tokenised = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            target_file,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename':tokenised}
    return process
if __name__ == '__main__':
    def __test():
        # Smoke test against a local Moses checkout.
        configuration = {'trg_lang':'de',
                         'trg_tokenisation_dir':'tmptoktrg',
                         'moses_installation_dir':os.path.abspath('../../../../')}
        values = {'trg_filename':'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Use the configured dictionary (the original passed the raw
        # `configuration`, which only worked because configure() happens
        # to preserve these key names).
        box = initialise(box_config)
        print(run_pipeline(box, values, None))
    #do some test
    __test()

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -133,8 +133,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -18,11 +18,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
@ -119,5 +122,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -131,7 +131,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -141,11 +141,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
</link>
<link>
<name>build_binary.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
</link>
<link>
<name>clean.sh</name>
<type>1</type>
@ -176,11 +171,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
</link>
<link>
<name>fragment.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
</link>
<link>
<name>left.hh</name>
<type>1</type>
@ -211,11 +201,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
</link>
<link>
<name>max_order.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
</link>
<link>
<name>max_order.hh</name>
<type>1</type>
@ -241,11 +226,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
</link>
<link>
<name>ngram_query.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
</link>
<link>
<name>ngram_query.hh</name>
<type>1</type>

View File

@ -7,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -23,13 +23,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -45,11 +46,8 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
</fileInfo>
<sourceEntries>
<entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@ -61,7 +59,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -119,5 +117,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,7 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@ -154,8 +155,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,8 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
@ -155,8 +157,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@ -9,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -26,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -152,8 +150,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/moses"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -127,6 +127,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -156,11 +156,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
</link>
<link>
<name>vertex_generator.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
</link>
<link>
<name>vertex_generator.hh</name>
<type>1</type>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -136,8 +136,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/util"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

42
contrib/rpm/README Normal file
View File

@ -0,0 +1,42 @@
Building Moses RPM
==================
*** WARNING ***
Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
*** WARNING ***
Building the RPM SPEC file
--------------------------
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
Building the RPM
----------------
Change directory to $HOME/rpmbuild, and build the binary RPM with:
$ rpmbuild -bb SPECS/moses.spec
This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and make the RPM in the directory $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.
For example building on a 64 bit Intel architecture, and building v1.0 the RPM would be called moses-1.0-1.x86_64.rpm.
Building a Debian package
-------------------------
The Alien tool converts RPM packages to Debian packages. If a Debian package is required then follow the instructions on the following web-page:
https://help.ubuntu.com/community/RPM/AlienHowto

63
contrib/rpm/build_source.sh Executable file
View File

@ -0,0 +1,63 @@
#!/bin/bash
#
# build_source.sh -- build the Moses source tarball and the RPM SPEC file
# in $HOME/rpmbuild, ready for a subsequent "rpmbuild -bb SPECS/moses.spec".
#
# Options:
#   -r <repo>     Git repository to clone (required)
#   -b <branch>   Git branch to build (default: master)
#   -v <version>  Version baked into the tarball name and SPEC (required)
#   -n            Build the source tarball only; skip the rpmbuild tree setup
#   -h            Show usage and exit

BRANCH="master"
declare -i NO_RPM_BUILD=0
# Placeholder token in ./rpmbuild/SPECS/moses.spec, replaced with ${VERSION}.
declare -r RPM_VERSION_TAG="___RPM_VERSION__"

function usage() {
    echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
    exit 1
}

# -r and -v are mandatory: that is two option/argument pairs at minimum.
if [ $# -lt 4 ]; then
    usage
fi

while getopts r:b:v:nh OPTION
do
    case "$OPTION" in
        r) REPO="${OPTARG}";;
        b) BRANCH="${OPTARG}";;
        v) VERSION="${OPTARG}";;
        n) NO_RPM_BUILD=1;;
        [h\?]) usage;;
    esac
done

# The SPEC template lives under ./rpmbuild; refuse to run from elsewhere.
if [ ! -d ./rpmbuild ]; then
    echo "RPM build directory not in current working directory"
    exit 1
fi

declare -r MOSES_DIR="moses-${VERSION}"

# Clone the requested repository and check out the requested branch.
git clone "${REPO}" "${MOSES_DIR}"
if [ $? -ne 0 ]; then
    echo "Failed to clone Git repository ${REPO}"
    exit 3
fi

cd "${MOSES_DIR}"
git checkout "${BRANCH}"
if [ $? -ne 0 ]; then
    echo "Failed to checkout branch ${BRANCH}"
    exit 3
fi
cd ..

# Roll the checkout into a versioned, compressed source tarball.
tar -cf "moses-${VERSION}.tar" "${MOSES_DIR}"
gzip -f9 "moses-${VERSION}.tar"

if [ ${NO_RPM_BUILD} -eq 0 ]; then
    if [ ! -d "${HOME}/rpmbuild/SPECS" ]; then
        mkdir -p "${HOME}/rpmbuild/SPECS"
    fi
    # Instantiate the SPEC template with the real version number. A plain,
    # quoted sed replaces the previous "eval sed ..." form, which offered no
    # benefit and would word-split/glob-expand its arguments.
    sed "s/${RPM_VERSION_TAG}/${VERSION}/" ./rpmbuild/SPECS/moses.spec > "${HOME}/rpmbuild/SPECS/moses.spec"
    if [ ! -d "${HOME}/rpmbuild/SOURCES" ]; then
        mkdir -p "${HOME}/rpmbuild/SOURCES"
    fi
    mv "moses-${VERSION}.tar.gz" "${HOME}/rpmbuild/SOURCES"
fi

# Clean up the working clone.
rm -Rf "${MOSES_DIR}"

View File

@ -0,0 +1,65 @@
# RPM SPEC file for packaging Moses (plus bundled IRSTLM and GIZA++).
# The ___RPM_VERSION__ token below is substituted with the real version
# number by contrib/rpm/build_source.sh before rpmbuild is invoked.
Name: moses
Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
Version: ___RPM_VERSION__
Release: 1
URL: http://www.statmt.org/moses/
Source0: %{name}-%{version}.tar.gz
License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
# NOTE(review): BuildRoot is hard-coded to a personal home directory; the
# conventional tmppath-based build root would be more portable -- confirm.
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
# Prep: unpack the Moses source, then fetch and build the pinned IRSTLM and
# GIZA++ releases into the install tree (network access required here).
%prep
%setup -q
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
cd $RPM_BUILD_DIR
tar -zxf irstlm-5.70.04.tgz
tar -zxf giza-pp-v1.0.7.tgz
# Build and install IRSTLM under /opt/moses in the build root.
cd irstlm-5.70.04
bash regenerate-makefiles.sh --force
./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
make
make install
# Build GIZA++ and copy its binaries alongside IRSTLM.
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
# Build Moses itself against the IRSTLM and GIZA++ trees prepared above.
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
# Install: copy the Moses binaries and the support script trees to /opt/moses.
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%clean
# Packaged payload: everything installed under /opt/moses, owned by root.
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v1.0.7/*

View File

@ -620,12 +620,29 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
template <class T>
void ShiftOffsets(vector<T> &offsets, T shift)
{
T currPos = shift;
for (size_t i = 0; i < offsets.size(); ++i) {
shift += offsets[i];
offsets[i] += shift;
if (offsets[i] == 0) {
offsets[i] = currPos;
++currPos;
}
else {
currPos += offsets[i];
}
}
}
size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
{
size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
for (size_t i = 0; i < prevHypos.size(); ++i) {
size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
ret -= (childSize - 1);
}
return ret;
}
size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
{
const ChartHypothesis *hypo = &node.GetHypothesis();
@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
// 1st. calc source size
// calc source size
size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
}
}
// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
}
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
return totalTargetSize;
@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
{
ostringstream out;
Alignments retAlign;
OutputAlignment(retAlign, hypo, 0);
if (hypo) {
Alignments retAlign;
OutputAlignment(retAlign, hypo, 0);
// output alignments
Alignments::const_iterator iter;
for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
const pair<size_t, size_t> &alignPoint = *iter;
out << alignPoint.first << "-" << alignPoint.second << " ";
// output alignments
Alignments::const_iterator iter;
for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
const pair<size_t, size_t> &alignPoint = *iter;
out << alignPoint.first << "-" << alignPoint.second << " ";
}
}
out << endl;
@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const ChartHypothesis *prevHypo = prevHypos[sourceInd];
// 1st. calc source size
// calc source size
size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
}
}
// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
}
return totalTargetSize;

View File

@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
}
}
ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
const StaticData &staticData = StaticData::Instance();
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[1];
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
return file;
}
/***
* print surface factor only for the given phrase
*/
@ -262,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
std::vector<const Hypothesis *> edges;
const Hypothesis *currentHypo = hypo;
while (currentHypo) {
edges.push_back(currentHypo);
currentHypo = currentHypo->GetPrevHypo();
}
OutputAlignment(out, edges);
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;

View File

@ -117,6 +117,8 @@ public:
return *m_outputSearchGraphStream;
}
std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
std::ostream &GetDetailedTranslationReportingStream() {
assert (m_detailedTranslationReportingStream);
return *m_detailedTranslationReportingStream;
@ -137,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
}

View File

@ -83,14 +83,18 @@ public:
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector) :
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector),
m_unknownsCollector(unknownsCollector) {}
m_unknownsCollector(unknownsCollector),
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@ -143,6 +147,42 @@ public:
#endif
}
// Output search graph in HTK standard lattice format (SLF)
if (m_outputSearchGraphSLF) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsSLF(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
}
// Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
if (m_outputSearchGraphHypergraph) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
file -> close();
delete file;
}
// apply decision rule and output best translation(s)
if (m_outputCollector) {
ostringstream out;
@ -157,7 +197,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR())
{
{
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@ -174,13 +214,18 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
}
}
out << endl;
}
}
// MBR decoding (n-best MBR, lattice MBR, consensus)
else
@ -311,6 +356,8 @@ private:
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
bool m_outputSearchGraphSLF;
bool m_outputSearchGraphHypergraph;
std::ofstream *m_alignmentStream;
@ -367,6 +414,63 @@ static void ShowWeights()
}
size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< i
<< "=" << values[i] << endl;
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< "=" << values[0] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
assert(false);
return 0;
}
}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
}
}
} //namespace
/** main function of the command line version of the decoder **/
@ -391,20 +495,20 @@ int main(int argc, char** argv)
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
Parameter* params = new Parameter();
if (!params->LoadParam(argc,argv)) {
Parameter params;
if (!params.LoadParam(argc,argv)) {
exit(1);
}
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) {
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
// setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) {
if (params.isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
@ -430,6 +534,14 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
OutputFeatureWeightsForHypergraph(*weightsOut);
weightsOut->flush();
weightsOut->close();
delete weightsOut;
}
// initialize output streams
// note: we can't just write to STDOUT or files
@ -533,7 +645,9 @@ int main(int argc, char** argv)
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
unknownsCollector.get() );
unknownsCollector.get(),
staticData.GetOutputSearchGraphSLF(),
staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
@ -551,6 +665,8 @@ int main(int argc, char** argv)
pool.Stop(true); //flush remaining jobs
#endif
delete ioWrapper;
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;

View File

@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
m_emptyAlignmentInfo = Add(pairs);
}
AlignmentInfoCollection::~AlignmentInfoCollection()
{}
const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
{
return *m_emptyAlignmentInfo;

View File

@ -55,6 +55,7 @@ class AlignmentInfoCollection
//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();
static AlignmentInfoCollection s_instance;

View File

@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs

View File

@ -36,8 +36,9 @@ using namespace std;
namespace Moses
{
LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
LanguageModelSingleFactor::~LanguageModelSingleFactor()
{
}
struct PointerState : public FFState {
const void* lmstate;
@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
m_beginSentenceState = new PointerState(NULL);
}
LanguageModelPointerState::~LanguageModelPointerState() {}
LanguageModelPointerState::~LanguageModelPointerState()
{
delete m_nullContextState;
delete m_beginSentenceState;
}
const FFState *LanguageModelPointerState::GetNullContextState() const
{

View File

@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include <algorithm>
#include <limits>
#include <cmath>
#include <limits>
#include <map>
#include <set>
#include "Manager.h"
#include "TypeDef.h"
#include "Util.h"
@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "rule.pb.h"
#endif
#include "util/exception.hh"
using namespace std;
namespace Moses
{
Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
:m_lineNumber(lineNumber)
,m_system(system)
:m_system(system)
,m_transOptColl(source.CreateTranslationOptionCollection(system))
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
,m_lineNumber(lineNumber)
,m_source(source)
{
m_system->InitializeBeforeSentenceProcessing(source);
@ -628,6 +632,420 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
}
void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
// outputSearchGraphStream << endl;
// outputSearchGraphStream << (*hypo) << endl;
// const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
// outputSearchGraphStream << scoreCollection << endl;
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
}
}
size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
<< " " << ff->GetScoreProducerWeightShortName()
<< " " << (i+1) << " of " << numScoreComps << endl
<< "x" << (index+i) << "scale=" << values[i] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
assert(false);
return 0;
}
}
size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
// { const FeatureFunction* sp = ff;
// const FVector& m_scores = scoreCollection.GetScoresVector();
// FVector& scores = const_cast<FVector&>(m_scores);
// std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
// // std::cout << "prefix==" << prefix << endl;
// // cout << "m_scores==" << m_scores << endl;
// // cout << "m_scores.size()==" << m_scores.size() << endl;
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
// // }
// for(int i=0, n=v.size(); i<n; i+=1) {
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
// }
// }
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
// outputSearchGraphStream << featureValues << endl;
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
// if (numScoreComps != ScoreProducer::unlimited) {
// vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
}
return index+numScoreComps;
// } else {
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
// assert(false);
// return 0;
// }
}
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
}
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " ";
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " ";
}
return index+numScoreComps;
}
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
map<int,int> mosesIDToHypergraphID;
// map<int,int> hypergraphIDToMosesID;
set<int> terminalNodes;
multimap<int,int> hypergraphIDToArcs;
long numNodes = 0;
long endNode = 0;
{
long hypergraphHypothesisID = 0;
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
// Get an id number for the previous hypothesis
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
if (prevHypo!=NULL) {
int mosesPrevHypothesisID = prevHypo->GetId();
if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
hypergraphHypothesisID += 1;
}
}
// Get an id number for this hypothesis
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
// Final arc to end node, representing the end of the sentence </s>
terminalNodes.insert(hypergraphHypothesisID);
}
hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
// Unique end node
endNode = hypergraphHypothesisID;
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
numNodes = endNode + 1;
}
long numArcs = searchGraph.size() + terminalNodes.size();
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
if (count > 0) {
outputSearchGraphStream << count << endl;
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
int lineNumber = (*it).second;
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
int mosesHypothesisID;// = thisHypo->GetId();
if (searchGraph[lineNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
}
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
util::Exception,
"Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
);
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
if (prevHypo==NULL) {
outputSearchGraphStream << "<s> ||| " << endl;
} else {
int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
UTIL_THROW_IF(
(startNode >= hypergraphHypothesisID),
util::Exception,
"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
outputSearchGraphStream << "[" << startNode << "]";
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
}
outputSearchGraphStream << " ||| ";
OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
outputSearchGraphStream << endl;
}
}
}
}
// Print node and arc(s) for end of sentence </s>
outputSearchGraphStream << terminalNodes.size() << endl;
for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
}
}
/**! Output search graph in HTK standard lattice format (SLF) */
// Writes one SLF lattice for the sentence identified by translationId:
// a header (UTTERANCE / VERSION / base / NODES / LINKS), the feature
// weights, then one "J=..." link line per target word of every hypothesis
// transition, plus an epsilon link from each terminal node to the unique
// end node. SLF links carry a single word, so a multi-word target phrase
// is expanded into a chain of links through intermediate nodes.
void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
long numArcs = 0;
long numNodes = 0;
map<int,int> nodes;      // Moses hypothesis id -> SLF node id of its end node
set<int> terminalNodes;  // SLF node ids that have been linked to the end node
// Unique start node
nodes[0] = 0;
// First pass: count links and nodes. Each hypothesis contributes one link
// per target word; the first time a hypothesis id is seen it also
// contributes targetWordCount new nodes (the intermediate per-word nodes
// plus its own end node). A terminal hypothesis (forward == -1) needs one
// extra epsilon link to the sentence-end node.
for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
numArcs += targetWordCount;
int hypothesisID = searchGraph[arcNumber].hypo->GetId();
if (nodes.count(hypothesisID) == 0) {
numNodes += targetWordCount;
nodes[hypothesisID] = numNodes;
//numNodes += 1;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
numArcs += 1;
}
}
}
numNodes += 1;
// Unique end node
nodes[numNodes] = numNodes;
// SLF header. base is e; NODES counts node ids 0..numNodes inclusive.
outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
outputSearchGraphStream << "VERSION=1.1" << endl;
outputSearchGraphStream << "base=2.71828182845905" << endl;
outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
outputSearchGraphStream << "LINKS=" << numArcs << endl;
OutputFeatureWeightsForSLF(outputSearchGraphStream);
// Second pass: emit the link ("J=") lines. arcNumber is the running link
// id across the whole lattice; lineNumber walks the search graph entries.
for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
// The initial (empty) hypothesis has no predecessor and emits nothing.
if (prevHypo) {
int startNode = nodes[prevHypo->GetId()];
int endNode = nodes[thisHypo->GetId()];
bool terminalNode = (searchGraph[lineNumber].forward == -1);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
// One link per target word. The chain runs
// startNode -> endNode-(targetWordCount-1) -> ... -> endNode,
// where x counts how many words (and thus nodes) remain.
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
int x = (targetWordCount-targetWordIndex);
outputSearchGraphStream << "J=" << arcNumber;
if (targetWordIndex==0) {
outputSearchGraphStream << " S=" << startNode;
} else {
outputSearchGraphStream << " S=" << endNode - x;
}
outputSearchGraphStream << " E=" << endNode - (x-1)
<< " W=" << targetPhrase.GetWord(targetWordIndex);
// Feature scores are attached only to the first link of the chain;
// subsequent links get zeros (second argument true).
OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
outputSearchGraphStream << endl;
arcNumber += 1;
}
// Terminal hypotheses get a single epsilon link to the unique end
// node; terminalNodes guards against emitting it twice when several
// arcs share the same end node.
if (terminalNode && terminalNodes.count(endNode) == 0) {
terminalNodes.insert(endNode);
outputSearchGraphStream << "J=" << arcNumber
<< " S=" << endNode
<< " E=" << numNodes
<< endl;
arcNumber += 1;
}
}
}
}
void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
const SearchGraphNode& searchNode)
{

View File

@ -93,6 +93,19 @@ class Manager
Manager(Manager const&);
void operator=(Manager const&);
const TranslationSystem* m_system;
private:
// Helper functions to output search graph in HTK standard lattice format
void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
// Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
protected:
// data
// InputType const& m_source; /**< source sentence to be translated */
@ -103,6 +116,7 @@ protected:
size_t interrupted_flag;
std::auto_ptr<SentenceStats> m_sentenceStats;
int m_hypoId; //used to number the hypos as they are created.
size_t m_lineNumber;
void GetConnectedGraph(
std::map< int, bool >* pConnected,
@ -113,7 +127,6 @@ protected:
public:
size_t m_lineNumber;
InputType const& m_source; /**< source sentence to be translated */
Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
~Manager();
@ -137,6 +150,8 @@ public:
#endif
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
const InputType& GetSource() const {
return m_source;

View File

@ -130,6 +130,8 @@ Parameter::Parameter()
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
@ -177,6 +179,7 @@ Parameter::Parameter()
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("alignment-output-file", "print output word alignments into given file");

View File

@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;
@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_outputSearchGraph = true;
m_outputSearchGraphExtended = true;
} else
} else {
m_outputSearchGraph = false;
}
if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
m_outputSearchGraphSLF = true;
} else {
m_outputSearchGraphSLF = false;
}
if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
m_outputSearchGraphHypergraph = true;
} else {
m_outputSearchGraphHypergraph = false;
}
#ifdef HAVE_PROTOBUF
if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {

View File

@ -171,6 +171,7 @@ protected:
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
bool m_onlyDistinctNBest;
bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
@ -216,6 +217,8 @@ protected:
bool m_outputWordGraph; //! whether to output word graph
bool m_outputSearchGraph; //! whether to output search graph
bool m_outputSearchGraphExtended; //! ... in extended format
bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph
#ifdef HAVE_PROTOBUF
bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
@ -458,7 +461,7 @@ public:
return m_nBestFilePath;
}
bool IsNBestEnabled() const {
return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
#ifdef HAVE_PROTOBUF
|| m_outputSearchGraphPB
#endif
@ -631,6 +634,12 @@ public:
bool GetOutputSearchGraphExtended() const {
return m_outputSearchGraphExtended;
}
bool GetOutputSearchGraphSLF() const {
return m_outputSearchGraphSLF;
}
bool GetOutputSearchGraphHypergraph() const {
return m_outputSearchGraphHypergraph;
}
#ifdef HAVE_PROTOBUF
bool GetOutputSearchGraphPB() const {
return m_outputSearchGraphPB;
@ -722,6 +731,9 @@ public:
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
bool PrintAlignmentInfo() const {
return m_PrintAlignmentInfo;
}
bool PrintAlignmentInfoInNbest() const {
return m_PrintAlignmentInfoNbest;
}

View File

@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
else if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct

View File

@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
}
}
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";

View File

@ -13,10 +13,10 @@ chomp(@OUT);
while(<SRC>) {
chomp;
if (/^<srcset/) {
s/<srcset/<tstset trglang="$language"/;
s/<srcset/<tstset trglang="$language"/i;
}
elsif (/^<\/srcset/) {
s/<\/srcset/<\/tstset/;
s/<\/srcset/<\/tstset/i;
}
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
@ -26,10 +26,10 @@ while(<SRC>) {
my $line = shift(@OUT);
$line = "" if $line =~ /NO BEST TRANSLATION/;
if (/<\/seg>/) {
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
}
else {
s/(<seg[^>]+> *)[^<]*/$1$line/;
s/(<seg[^>]+> *)[^<]*/$1$line/i;
}
}
print $_."\n";

View File

@ -16,15 +16,15 @@ $HELP = 1
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'filler=s' => \$FILLER,
'factored' => \$FACTORED,
'factored' => \$FACTORED,
'min-size=i' => \$MIN_SIZE,
'min-count=i' => \$MIN_COUNT,
'max-count=i' => \$MAX_COUNT,
'help' => \$HELP,
'verbose' => \$VERBOSE,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'syntax' => \$SYNTAX,
'binarize' => \$BINARIZE,
'mark-split' => \$MARK_SPLIT,
'train' => \$TRAIN);
if ($HELP ||
@ -155,34 +155,37 @@ sub apply {
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
$COUNT{$lc} = $count;
$TRUECASE{$lc} = $factored_word;
$LABEL{$lc} = $label if $SYNTAX;
$LABEL{$lc} = $label if $SYNTAX;
}
close(MODEL);
while(<STDIN>) {
my $first = 1;
chop; s/\s+/ /g; s/^ //; s/ $//;
my @BUFFER; # for xml tags
my @BUFFER; # for xml tags
foreach my $factored_word (split) {
print " " unless $first;
$first = 0;
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
# syntax: don't split xml
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
push @BUFFER,$factored_word;
$first = 1;
next;
}
# get case class
my $word = $factored_word;
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
print STDERR "considering $word ($lc)...\n" if $VERBOSE;
# don't split frequent words
if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
$lc !~ /[a-zA-Z]/) {; # has to have at least one letter
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
print $factored_word;
print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
next;
}

View File

@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
# Extract the value of attribute $name from an SGML tag string $data.
# Returns a one-element list with the value on a match, or the empty
# list when the attribute is absent. Quotes around the value are
# optional (tolerates sloppy reference files); matching is
# case-insensitive (/i) and lets "." cross newlines (/s).
# Fix: the block contained two consecutive match statements — the first
# (the pre-change, quotes-required pattern) ran in void context and its
# result was discarded; that dead statement is removed here.
sub extract_sgml_tag_attribute
{
    my ($name, $data) = @_;
    return ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}
#################################

View File

@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
my ($SRC,$INFILE);
my ($SRC,$INFILE,$UNBUFFERED);
die("detruecase.perl < in > out")
unless &GetOptions('headline=s' => \$SRC,
'in=s' => \$INFILE);
'in=s' => \$INFILE,
'b|unbuffered' => \$UNBUFFERED);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);

View File

@ -4,7 +4,7 @@
use strict;
use Getopt::Long "GetOptions";
my ($SRC,$INFILE,$RECASE_MODEL);
my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
my $MOSES = "moses";
my $LANGUAGE = "en"; # English by default;
die("recase.perl --in file --model ini-file > out")
@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
'headline=s' => \$SRC,
'lang=s' => \$LANGUAGE,
'moses=s' => \$MOSES,
'model=s' => \$RECASE_MODEL)
'model=s' => \$RECASE_MODEL,
'b|unbuffered' => \$UNBUFFERED)
&& defined($INFILE)
&& defined($RECASE_MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %treated_languages = map { ($_,1) } qw/en cs/;
die "I don't know any rules for $LANGUAGE. Use 'en' as the default."

View File

@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
my $MODEL;
die("truecase.perl --model truecaser < in > out")
unless &GetOptions('model=s' => \$MODEL);
my ($MODEL, $UNBUFFERED);
die("truecase.perl --model MODEL [-b] < in > out")
unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
&& defined($MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");

View File

@ -171,7 +171,7 @@ if ($TIMING)
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array cotaining a batch of tokenized texts for the input array
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;

View File

@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
$opn = "zcat $l1input.gz |";
$opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
$opn = "zcat $l2input.gz |";
$opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@ -160,3 +160,4 @@ sub word_count {
my @w = split(/ /,$line);
return scalar @w;
}

View File

@ -40,7 +40,8 @@ def printUsage():
def main():
parser = optparse.OptionParser()
parser.add_option("-c", "--min-non-initial-rule-count",
action="store", dest="minCount", type="int", default="1",
action="store", dest="minCount",
type="float", default="0.0",
help="prune non-initial rules where count is below N",
metavar="N")
(options, args) = parser.parse_args()