mosesdecoder/contrib/promix/main.py

#!/usr/bin/env python

#
# Implementation of PRO training and extensions to train phrase weights
#

import gzip
import logging
from numpy import array
import optparse
import os.path
import sys

from nbest import *
from sampler import *
from train import *


# Prefix every log message with a timestamp and enable DEBUG-level output on
# this script's 'main' logger.
logging.basicConfig(format = "%(asctime)-15s %(message)s")
log = logging.getLogger('main')
log.setLevel(logging.DEBUG)

class Config:
  """Command-line configuration for the PRO / mixture-model trainer.

  Wraps an optparse.OptionParser. After parse() has run, the following
  attributes are populated:

    nbest_files   -- list of n-best file paths (-n, repeatable)
    score_files   -- list of score file paths (-S, repeatable), expected in
                     the same order as nbest_files
    ttables       -- list of phrase tables (-p, repeatable), or None when
                     the option was not given
    input_file    -- source text file (-i), or None
    trainer       -- "pro" or "mix" (-t), defaults to "pro"
    moses_bin_dir -- directory containing the Moses binaries (-m)
  """

  def __init__(self):
    """Build the option parser and initialise every attribute."""
    default_bin_dir = os.path.expanduser("~/moses/bin")

    self.parser = optparse.OptionParser(usage="%prog [options] ")
    self.parser.add_option("-t", "--trainer", action="store",
      dest="trainer", metavar="TYPE", type="choice", choices=("pro","mix"),
      default="pro",
      help="type of trainer to run (pro,mix)")
    self.parser.add_option("-n", "--nbest", action="append",
      dest="nbest", metavar="NBEST-FILE",
      help="nbest output file(s) from decoder")
    self.parser.add_option("-S", "--scfile", action="append",
      dest="score", metavar="SCORE-FILE",
      help="score file(s) from extractor (in same order as nbests)")
    self.parser.add_option("-p", "--phrase-table" , action="append",
      dest="ttable", metavar="TTABLE",
      help="ttable to be used in mixture model training")
    self.parser.add_option("-i", "--input-file", action="store",
      dest="input_file", metavar="INPUT-FILE",
      help="source text file")
    self.parser.add_option("-m", "--moses-bin-dir", action="store",
      dest="moses_bin_dir", metavar="DIR",
      help="directory containing Moses binaries",
      default=default_bin_dir)

    self.nbest_files = []
    self.score_files = []
    self.ttables = []
    # Mirror the option defaults so these attributes exist before parse().
    self.input_file = None
    self.trainer = "pro"
    self.moses_bin_dir = default_bin_dir

  def parse(self, args=None):
    """Parse command-line arguments and validate the combination.

    args -- list of argument strings; when None (the default) the current
            sys.argv[1:] is used. It is looked up at call time rather than
            frozen when the class is defined.

    Positional (non-option) arguments are ignored. Invalid combinations are
    reported through the parser's error() method, which prints a usage
    message and exits.
    """
    if args is None:
      args = sys.argv[1:]
    options, _ = self.parser.parse_args(args)

    self.nbest_files = options.nbest
    self.score_files = options.score
    self.ttables = options.ttable
    self.input_file = options.input_file
    self.trainer = options.trainer
    self.moses_bin_dir = options.moses_bin_dir

    # Fall back to the bundled sample data when no files were given.
    if not self.nbest_files:
      self.nbest_files = ["data/esen.nc.nbest.segment"]
    if not self.score_files:
      self.score_files = ["data/esen.nc.scores"]

    # n-best and score files are consumed pairwise, so the counts must match.
    if len(self.nbest_files) != len(self.score_files):
      self.parser.error("Must have equal numbers of score files and nbest files")

    if self.trainer == "mix":
      if not self.input_file or not self.ttables:
        self.parser.error("Need to specify input file and ttables for mix training")
      # NOTE: the former check requiring exactly two ttables is disabled:
      #if len(self.ttables) != 2:
      #  self.parser.error("Can only train mix model with 2 ttables at the moment")

def main():
  """Run PRO or mixture-model training from the command line.

  Parses the command-line options, collects training samples from every
  (n-best, score) file pair, trains the selected model and prints the
  learned weights to stdout: one "F<i> <weight>" line per feature weight,
  followed by one "M<i>_<j> <weight>" line per mixture weight.

  Raises ValueError if the configured trainer type is neither "pro" nor
  "mix".
  """
  config = Config()
  config.parse()

  # The segments flag is only switched on for mixture-model training; it
  # does not change between files, so compute it once.
  segments = config.trainer == "mix"

  samples = []
  sampler = HopkinsMaySampler()
  for nbest_file, score_data_file in zip(config.nbest_files, config.score_files):
    log.debug("nbest: %s; score:%s", nbest_file, score_data_file)
    for nbest in get_scored_nbests(nbest_file, score_data_file,
                                   config.input_file, segments=segments):
      samples.extend(sampler.sample(nbest))
    log.debug("Samples loaded")

  if config.trainer == "mix":
    # Add the phrase table scores to both hypotheses of every sample.
    scorer = MosesPhraseScorer(config.ttables)
    log.debug("Scoring samples...")
    for sample in samples:
      scorer.add_scores(sample.hyp1)
      scorer.add_scores(sample.hyp2)
    log.debug("...samples scored")
    trainer = MixtureModelTrainer(samples)
  elif config.trainer == "pro":
    trainer = ProTrainer(samples)
  else:
    # Raise explicitly: an assert would be stripped under "python -O".
    raise ValueError("Unknown trainer type: %r" % (config.trainer,))

  log.debug("Starting training...")
  weights, mix_weights = trainer.train(debug=False)
  log.debug("...training complete")

  # Parenthesised single-argument print works on both Python 2 and 3.
  for i, w in enumerate(weights):
    print("F%d %10.8f" % (i, w))
  # mix_weights is iterated as a sequence of per-feature weight sequences.
  for i, f in enumerate(mix_weights):
    for j, w in enumerate(f):
      print("M%d_%d %10.8f" % (i, j, w))


# Run the trainer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
  main()