mosesdecoder/contrib/promix/main.py
2013-03-22 10:35:19 +00:00

113 lines
3.7 KiB
Python
Executable File

#!/usr/bin/env python
#
# Implementation of PRO training and extensions to train phrase weights
#
import gzip
import logging
from numpy import array
import optparse
import os.path
import sys
from nbest import *
from sampler import *
from train import *
# Module logger, registered under the name 'main', with DEBUG messages enabled.
log = logging.getLogger("main")
log.setLevel(logging.DEBUG)

# Root logging configuration: records are rendered as "<timestamp> <message>".
logging.basicConfig(format="%(asctime)-15s %(message)s")
class Config:
    """Command-line configuration for the PRO / mixture-model trainer.

    Wraps an ``optparse.OptionParser``.  After :meth:`parse` has run, the
    selected settings are available as attributes:

    * ``trainer``       -- ``"pro"`` or ``"mix"``.
    * ``nbest_files``   -- list of n-best file names.
    * ``score_files``   -- list of score file names (same length as
      ``nbest_files``).
    * ``ttables``       -- list of phrase-table names (empty if none given).
    * ``input_file``    -- source text file name, or ``None`` if not given.
    * ``moses_bin_dir`` -- directory containing the Moses binaries.
    """

    def __init__(self):
        """Build the option parser and give every attribute a default."""
        default_bin_dir = os.path.expanduser("~/moses/bin")

        self.parser = optparse.OptionParser(usage="%prog [options] ")
        self.parser.add_option(
            "-t", "--trainer", action="store",
            dest="trainer", metavar="TYPE", type="choice",
            choices=("pro", "mix"), default="pro",
            help="type of trainer to run (pro,mix)")
        self.parser.add_option(
            "-n", "--nbest", action="append",
            dest="nbest", metavar="NBEST-FILE",
            help="nbest output file(s) from decoder")
        self.parser.add_option(
            "-S", "--scfile", action="append",
            dest="score", metavar="SCORE-FILE",
            help="score file(s) from extractor (in same order as nbests)")
        self.parser.add_option(
            "-p", "--phrase-table", action="append",
            dest="ttable", metavar="TTABLE",
            help="ttable to be used in mixture model training")
        self.parser.add_option(
            "-i", "--input-file", action="store",
            dest="input_file", metavar="INPUT-FILE",
            help="source text file")
        self.parser.add_option(
            "-m", "--moses-bin-dir", action="store",
            dest="moses_bin_dir", metavar="DIR",
            help="directory containing Moses binaries",
            default=default_bin_dir)

        # Define every attribute up front so that reading one before parse()
        # has been called does not raise AttributeError.
        self.nbest_files = []
        self.score_files = []
        self.ttables = []
        self.input_file = None
        self.trainer = "pro"
        self.moses_bin_dir = default_bin_dir

    def parse(self, args=None):
        """Parse ``args`` and store the resulting settings on ``self``.

        Args:
            args: list of command-line arguments, without the program name.
                When ``None`` (the default), ``sys.argv[1:]`` is read at
                call time.

        On invalid input ``self.parser.error`` is called, which prints a
        usage message and raises ``SystemExit``.  This happens when the
        numbers of n-best and score files differ, or when the ``mix``
        trainer is selected without an input file and at least one ttable.
        """
        # Resolve the default here rather than in the signature: a default of
        # ``sys.argv[1:]`` would be evaluated once, when the class is defined.
        if args is None:
            args = sys.argv[1:]

        options, _ = self.parser.parse_args(args)

        # "append" options are None when never given; fall back to the
        # built-in sample data files for n-bests and scores.
        self.nbest_files = options.nbest or ["data/esen.nc.nbest.segment"]
        self.score_files = options.score or ["data/esen.nc.scores"]
        # Keep ttables a list even when no -p option was supplied.
        self.ttables = options.ttable or []
        self.input_file = options.input_file
        self.trainer = options.trainer
        self.moses_bin_dir = options.moses_bin_dir

        if len(self.nbest_files) != len(self.score_files):
            self.parser.error("Must have equal numbers of score files and nbest files")
        if self.trainer == "mix":
            if not self.input_file or not self.ttables:
                self.parser.error("Need to specify input file and ttables for mix training")
            # NOTE: an earlier check restricting mix training to exactly two
            # ttables is intentionally not enforced.
def main():
    """Run PRO or mixture-model training and print the learned weights.

    Reads the command line through ``Config``, draws samples from every
    (n-best file, score file) pair with a ``HopkinsMaySampler``, trains the
    selected trainer on them and writes the result to standard output:
    one ``F<i> <weight>`` line per entry of the first value returned by
    ``train`` and one ``M<i>_<j> <weight>`` line per entry of each row of
    the second.

    Raises:
        ValueError: if the configured trainer is neither "pro" nor "mix".
    """
    config = Config()
    config.parse()

    use_mix = config.trainer == "mix"

    # Collect samples from every n-best list; segment information is only
    # requested for mixture-model training.
    sampler = HopkinsMaySampler()
    samples = []
    for nbest_file, score_data_file in zip(config.nbest_files, config.score_files):
        log.debug("nbest: %s; score:%s", nbest_file, score_data_file)
        scored_nbests = get_scored_nbests(
            nbest_file, score_data_file, config.input_file, segments=use_mix)
        for nbest in scored_nbests:
            samples += sampler.sample(nbest)
    log.debug("Samples loaded")

    if use_mix:
        # Add the phrase table scores to both hypotheses of each sample.
        scorer = MosesPhraseScorer(config.ttables)
        log.debug("Scoring samples...")
        for sample in samples:
            scorer.add_scores(sample.hyp1)
            scorer.add_scores(sample.hyp2)
        log.debug("...samples scored")
        trainer = MixtureModelTrainer(samples)
    elif config.trainer == "pro":
        trainer = ProTrainer(samples)
    else:
        # Explicit error instead of ``assert``, which is stripped under -O.
        raise ValueError("unknown trainer type: %r" % (config.trainer,))

    log.debug("Starting training...")
    weights, mix_weights = trainer.train(debug=False)
    log.debug("...training complete")

    # print(...) with a single pre-formatted string produces the same output
    # under both Python 2 and Python 3.
    for i, w in enumerate(weights):
        print("F%d %10.8f" % (i, w))
    for i, f in enumerate(mix_weights):
        for j, w in enumerate(f):
            print("M%d_%d %10.8f" % (i, j, w))


if __name__ == "__main__":
    main()