#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>

# This program handles the combination of Moses phrase tables, either through
# linear interpolation of the phrase translation probabilities/lexical weights,
# or through a recomputation based on the (weighted) combined counts.
#
# It also supports an automatic search for weights that minimize the cross-entropy
# between the model and a tuning set of word/phrase alignments.
#
# For usage information, run
#     python tmcombine.py -h
# You can also check the docstrings of Combine_TMs() for more information and find some example commands in the function test().
#
# Some general things to note:
# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models.
# - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing); sort them with LC_ALL=C.
# - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007).
# - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files.
# - The cross-entropy estimation assumes that phrase tables contain true probability distributions (i.e. a probability mass of 1 for each conditional probability distribution). If this is not true, the results are skewed.
# - Unknown phrase pairs are not considered for the cross-entropy estimation. A comparison of models with different vocabularies may be misleading.
# - Don't directly compare cross-entropies obtained from a combination with different modes. Depending on how some corner cases are treated, linear interpolation does not distribute the full probability mass and thus shows higher (i.e. worse) cross-entropies.
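#
# Example (an illustrative sketch only; model paths and weights are placeholders, and the
# authoritative entry points are documented in the Combine_TMs docstring, `python tmcombine.py -h`
# and the test() function):
#
#   combiner = Combine_TMs([('/path/to/model1', 'primary'), ('/path/to/model2', 'primary')],
#                          weights=[0.5, 0.5],
#                          output_file='phrase-table.combined',
#                          mode='interpolate')
#   # ...then call the combination method you need on `combiner`.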

from __future__ import division, unicode_literals

import sys
import os
import gzip
import argparse
import copy
import re

from math import log, exp
from collections import defaultdict
from operator import mul
from tempfile import NamedTemporaryFile
from subprocess import Popen

try:
    from itertools import izip
except ImportError:
    izip = zip

try:
    from lxml import etree as ET
except ImportError:
    import xml.etree.cElementTree as ET

try:
    from scipy.optimize.lbfgsb import fmin_l_bfgs_b
    optimizer = 'l-bfgs'
except ImportError:
    optimizer = 'hillclimb'

class Moses():
    """Moses interface for loading/writing models.
    to support other phrase table formats, subclass this and overwrite the relevant functions
    """

    def __init__(self, models, number_of_features):

        self.number_of_features = number_of_features
        self.models = models

        # example item (assuming mode=='counts' and one feature): phrase_pairs['the house']['das haus'] = [[[10,100]],['0-0 1-1']]
        self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)], []]))
        self.phrase_source = defaultdict(lambda: [0]*len(self.models))
        self.phrase_target = defaultdict(lambda: [0]*len(self.models))

        self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))

        self.word_pairs_e2f = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models)))
        self.word_pairs_f2e = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models)))
        self.word_source = defaultdict(lambda: [0]*len(self.models))
        self.word_target = defaultdict(lambda: [0]*len(self.models))

        self.require_alignment = False

    def open_table(self, model, table, mode='r'):
        """define which paths to open for lexical tables and phrase tables.
        we assume canonical Moses structure, but feel free to overwrite this
        """
        if table == 'reordering-table':
            table = 'reordering-table.wbe-msd-bidirectional-fe'

        filename = os.path.join(model, 'model', table)
        fileobj = handle_file(filename, 'open', mode)
        return fileobj

    def load_phrase_features(self, line, priority, i, mode='interpolate', store='pairs', filter_by=None, filter_by_src=None, filter_by_target=None, inverted=False, flags=None):
        """take a single phrase table line and store the probabilities in the internal data structure"""

        src = line[0]
        target = line[1]

        if inverted:
            src, target = target, src

        if (store == 'all' or store == 'pairs') and (priority < 10 or (src in self.phrase_pairs and target in self.phrase_pairs[src])) and not (filter_by and not (src in filter_by and target in filter_by[src])):

            self.store_info(src, target, line)

            scores = line[2].split()
            if len(scores) < self.number_of_features:
                sys.stderr.write('Error: model only has {0} features. Expected {1}.\n'.format(len(scores), self.number_of_features))
                exit(1)

            scores = scores[:self.number_of_features]
            model_probabilities = list(map(float, scores))
            phrase_probabilities = self.phrase_pairs[src][target][0]

            if mode == 'counts' and not priority == 2: #priority 2 is MAP
                try:
                    counts = list(map(float, line[4].split()))
                    try:
                        target_count, src_count, joint_count = counts
                        joint_count_e2f = joint_count
                        joint_count_f2e = joint_count
                    except ValueError:
                        # possibly old-style phrase table with 2 counts in last column, or phrase table produced by tmcombine
                        # note: since each feature has a different weight vector, we may have two different phrase pair frequencies
                        target_count, src_count = counts
                        i_e2f = flags['i_e2f']
                        i_f2e = flags['i_f2e']
                        joint_count_e2f = model_probabilities[i_e2f] * target_count
                        joint_count_f2e = model_probabilities[i_f2e] * src_count
                except:
                    sys.stderr.write(str(b' ||| '.join(line)) + '\n')
                    sys.stderr.write('ERROR: counts are missing or misformatted. Maybe your phrase table is from an older Moses version that doesn\'t store counts or word alignment?\n')
                    raise

                i_e2f = flags['i_e2f']
                i_f2e = flags['i_f2e']
                model_probabilities[i_e2f] = joint_count_e2f
                model_probabilities[i_f2e] = joint_count_f2e

            for j, p in enumerate(model_probabilities):
                phrase_probabilities[j][i] = p

        # mark that the src/target phrase has been seen.
        # needed for re-normalization during linear interpolation
        if (store == 'all' or store == 'source') and not (filter_by_src and not src in filter_by_src):
            if mode == 'counts' and not priority == 2: #priority 2 is MAP
                try:
                    self.phrase_source[src][i] = float(line[4].split()[1])
                except:
                    sys.stderr.write(str(line) + '\n')
                    sys.stderr.write('ERROR: Counts are missing or misformatted. Maybe your phrase table is from an older Moses version that doesn\'t store counts or word alignment?\n')
                    raise
            else:
                self.phrase_source[src][i] = 1

        if (store == 'all' or store == 'target') and not (filter_by_target and not target in filter_by_target):
            if mode == 'counts' and not priority == 2: #priority 2 is MAP
                try:
                    self.phrase_target[target][i] = float(line[4].split()[0])
                except:
                    sys.stderr.write(str(line) + '\n')
                    sys.stderr.write('ERROR: Counts are missing or misformatted. Maybe your phrase table is from an older Moses version that doesn\'t store counts or word alignment?\n')
                    raise
            else:
                self.phrase_target[target][i] = 1

    def load_reordering_probabilities(self, line, priority, i, **unused):
        """take a single reordering table line and store the probabilities in the internal data structure"""

        src = line[0]
        target = line[1]

        model_probabilities = list(map(float, line[2].split()))
        reordering_probabilities = self.reordering_pairs[src][target]

        try:
            for j, p in enumerate(model_probabilities):
                reordering_probabilities[j][i] = p
        except IndexError:
            sys.stderr.write('\nIndexError: Did you correctly specify the number of reordering features? (--number_of_features N on the command line)\n')
            exit(1)

    def traverse_incrementally(self, table, models, load_lines, store_flag, mode='interpolate', inverted=False, lowmem=False, flags=None):
        """hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory.
        relies on alphabetical sorting of the phrase tables.
        """

        increment = -1
        stack = ['']*len(self.models)

        while increment:
            self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)], []]))
            self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
            self.phrase_source = defaultdict(lambda: [0]*len(self.models))
            if lowmem:
                self.phrase_target = defaultdict(lambda: [0]*len(self.models))

            for model, priority, i in models:

                if stack[i]:
                    if increment != stack[i][0]:
                        continue
                    else:
                        load_lines(stack[i], priority, i, mode=mode, store=store_flag, inverted=inverted, flags=flags)
                        stack[i] = ''

                for line in model:
                    line = line.rstrip().split(b' ||| ')
                    if line[-1].endswith(b' |||'):
                        line[-1] = line[-1][:-4]
                        line.append(b'')

                    if increment != line[0]:
                        stack[i] = line
                        break

                    load_lines(line, priority, i, mode=mode, store=store_flag, inverted=inverted, flags=flags)

            yield 1

            # calculate which source phrase to process next
            lines = [line[0] + b' |' for line in stack if line]
            if lines:
                increment = min(lines)[:-2]
            else:
                increment = None
    def load_word_probabilities(self, line, side, i, priority, e2f_filter=None, f2e_filter=None):
        """process a single line of a lexical table"""

        a, b, prob = line.split(b' ')

        if side == 'e2f' and (not e2f_filter or a in e2f_filter and b in e2f_filter[a]):
            self.word_pairs_e2f[a][b][i] = float(prob)
        elif side == 'f2e' and (not f2e_filter or a in f2e_filter and b in f2e_filter[a]):
            self.word_pairs_f2e[a][b][i] = float(prob)

    def load_word_counts(self, line, side, i, priority, e2f_filter=None, f2e_filter=None, flags=None):
        """process a single line of a lexical count table"""

        a, b, ab_count, b_count = line.split(b' ')

        if side == 'e2f':
            if priority == 2: #MAP
                if not e2f_filter or a in e2f_filter:
                    if not e2f_filter or b in e2f_filter[a]:
                        self.word_pairs_e2f[a][b][i] = float(ab_count) / float(b_count)
                        self.word_target[b][i] = 1
            else:
                if not e2f_filter or a in e2f_filter:
                    if not e2f_filter or b in e2f_filter[a]:
                        self.word_pairs_e2f[a][b][i] = float(ab_count)
                        self.word_target[b][i] = float(b_count)
        elif side == 'f2e':
            if priority == 2: #MAP
                if not f2e_filter or a in f2e_filter and b in f2e_filter[a]:
                    if not f2e_filter or b in f2e_filter[a]:
                        self.word_pairs_f2e[a][b][i] = float(ab_count) / float(b_count)
                        self.word_source[b][i] = 1
            else:
                if not f2e_filter or a in f2e_filter and b in f2e_filter[a]:
                    if not f2e_filter or b in f2e_filter[a]:
                        self.word_pairs_f2e[a][b][i] = float(ab_count)
                        self.word_source[b][i] = float(b_count)

    def load_lexical_tables(self, models, mode, e2f_filter=None, f2e_filter=None):
        """open and load lexical tables into the data structure"""

        if mode == 'counts':
            files = ['lex.counts.e2f', 'lex.counts.f2e']
            load_lines = self.load_word_counts
        else:
            files = ['lex.e2f', 'lex.f2e']
            load_lines = self.load_word_probabilities

        j = 0

        for f in files:
            models_prioritized = [(self.open_table(model, f), priority, i) for (model, priority, i) in priority_sort_models(models)]

            for model, priority, i in models_prioritized:
                for line in model:
                    if not j % 100000:
                        sys.stderr.write('.')
                    j += 1
                    load_lines(line, f[-3:], i, priority, e2f_filter=e2f_filter, f2e_filter=f2e_filter)
    def store_info(self, src, target, line):
        """store alignment info and comment section for re-use in output"""

        if len(line) >= 5:
            if not self.phrase_pairs[src][target][1]:
                self.phrase_pairs[src][target][1] = line[3:]

        # assuming that alignment is empty
        elif len(line) == 4:
            if self.require_alignment:
                sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
                exit(1)

            self.phrase_pairs[src][target][1] = [b'', line[3].lstrip(b'| ')]

        else:
            sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n')
            exit(1)
    def get_word_alignments(self, src, target, cache=False, mycache={}):
        """from the Moses phrase table alignment info in the form "0-0 1-0",
        get the aligned word pairs / NULL alignments
        """
        if cache:
            if (src, target) in mycache:
                return mycache[(src, target)]

        try:
            alignment = self.phrase_pairs[src][target][1][0]
        except:
            return None, None

        src_list = src.split(b' ')
        target_list = target.split(b' ')

        textual_e2f = [[s, []] for s in src_list]
        textual_f2e = [[t, []] for t in target_list]

        for pair in alignment.split(b' '):
            s, t = pair.split(b'-')
            s, t = int(s), int(t)
            textual_e2f[s][1].append(target_list[t])
            textual_f2e[t][1].append(src_list[s])

        for s, t in textual_e2f:
            if not t:
                t.append(b'NULL')

        for s, t in textual_f2e:
            if not t:
                t.append(b'NULL')

        # convert to tuples so we can use the values as dictionary keys
        for i in range(len(textual_e2f)):
            textual_e2f[i][1] = tuple(textual_e2f[i][1])
        for i in range(len(textual_f2e)):
            textual_f2e[i][1] = tuple(textual_f2e[i][1])

        if cache:
            mycache[(src, target)] = textual_e2f, textual_f2e

        return textual_e2f, textual_f2e
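    # Illustrative example (hypothetical phrase pair): for src=b'a b', target=b'x' with
    # alignment b'0-0 1-0', this returns
    #   textual_e2f = [[b'a', (b'x',)], [b'b', (b'x',)]]
    #   textual_f2e = [[b'x', (b'a', b'b')]]
    # and unaligned words would receive the tuple (b'NULL',).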
    def write_phrase_table(self, src, target, weights, features, mode, flags):
        """convert data to a string in Moses phrase table format"""

        # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to the phrase table
        # (phrase pair will end up with probability zero in the log-linear model anyway)
        if 0 in features:
            return b''

        # information specific to the Moses model: alignment info and comment section with target and source counts
        additional_entries = self.phrase_pairs[src][target][1]
        alignment = additional_entries[0]
        if alignment:
            extra_space = b' '
        else:
            extra_space = b''

        if mode == 'counts':
            i_e2f = flags['i_e2f']
            i_f2e = flags['i_f2e']
            srccount = dot_product(self.phrase_source[src], weights[i_f2e])
            targetcount = dot_product(self.phrase_target[target], weights[i_e2f])
            additional_entries[1] = b"%s %s" % (targetcount, srccount)

        features = b' '.join([b'%.6g' % (f) for f in features])

        if flags['add_origin_features']:
            origin_features = list(map(lambda x: 2.718**bool(x), self.phrase_pairs[src][target][0][0])) # 1 if the phrase pair doesn't occur in the model, 2.718 if it does
            origin_features = b' '.join([b'%.4f' % (f) for f in origin_features]) + b' '
        else:
            origin_features = b''

        if flags['write_phrase_penalty']:
            phrase_penalty = b'2.718 '
        else:
            phrase_penalty = b''

        line = b"%s ||| %s ||| %s %s%s||| %s%s||| %s\n" % (src, target, features, origin_features, phrase_penalty, alignment, extra_space, b' ||| '.join(additional_entries[1:]))
        return line
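    # A sketch of the produced line (hypothetical values), following the standard Moses format
    # "src ||| target ||| features ||| alignment ||| counts":
    #   b'das haus ||| the house ||| 0.8 0.5 0.7 0.6 ||| 0-0 1-1 ||| 100 80\n'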
    def write_lexical_file(self, direction, path, weights, mode):

        if mode == 'counts':
            bridge = '.counts'
        else:
            bridge = ''

        fobj = handle_file("{0}{1}.{2}".format(path, bridge, direction), 'open', mode='w')
        sys.stderr.write('Writing {0}{1}.{2}\n'.format(path, bridge, direction))

        if direction == 'e2f':
            word_pairs = self.word_pairs_e2f
            marginal = self.word_target
        elif direction == 'f2e':
            word_pairs = self.word_pairs_f2e
            marginal = self.word_source

        for x in sorted(word_pairs):
            for y in sorted(word_pairs[x]):
                xy = dot_product(word_pairs[x][y], weights)
                fobj.write(b"%s %s %s" % (x, y, xy))
                if mode == 'counts':
                    fobj.write(b" %s\n" % (dot_product(marginal[y], weights)))
                else:
                    fobj.write(b'\n')

        handle_file("{0}{1}.{2}".format(path, bridge, direction), 'close', fobj, mode='w')
    def write_reordering_table(self, src, target, features):
        """convert data to a string in Moses reordering table format"""

        # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to the reordering table
        # (phrase pair will end up with probability zero in the log-linear model anyway)
        if 0 in features:
            return b''

        features = b' '.join([b'%.6g' % (f) for f in features])

        line = b"%s ||| %s ||| %s\n" % (src, target, features)
        return line

    def create_inverse(self, fobj, tempdir=None):
        """swap source and target phrase in the phrase table, and then sort (by target phrase)"""

        inverse = NamedTemporaryFile(prefix='inv_unsorted', delete=False, dir=tempdir)
        swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\| ')

        # just swap source and target phrase, and leave the order of scores etc. intact.
        # For better compatibility with the existing codebase, we swap the order of the phrases back for processing
        for line in fobj:
            inverse.write(swap.sub(b'\\2 ||| \\1 ||| ', line, 1))

        inverse.close()

        inverse_sorted = sort_file(inverse.name, tempdir=tempdir)
        os.remove(inverse.name)

        return inverse_sorted
    def merge(self, pt_normal, pt_inverse, pt_out, mode='interpolate'):
        """merge two phrase tables (the latter having been inverted to calculate p(s|t) and lex(s|t) in sorted order).
        Assumes that p(s|t) and lex(s|t) are in the first table half, p(t|s) and lex(t|s) in the second.
        """

        for line, line2 in izip(pt_normal, pt_inverse):

            line = line.split(b' ||| ')
            if line[-1].endswith(b' |||'):
                line[-1] = line[-1][:-4]
                line.append(b'')

            line2 = line2.split(b' ||| ')
            if line2[-1].endswith(b' |||'):
                line2[-1] = line2[-1][:-4]
                line2.append(b'')

            # scores
            mid = int(self.number_of_features/2)
            scores1 = line[2].split()
            scores2 = line2[2].split()
            line[2] = b' '.join(scores2[:mid] + scores1[mid:])

            # marginal counts
            if mode == 'counts':
                src_count = line[4].split()[1]
                target_count = line2[-1].split()[0]
                line[4] = b' '.join([target_count, src_count])

            pt_out.write(b' ||| '.join(line) + b'\n')

        pt_normal.close()
        pt_inverse.close()
        pt_out.close()

class TigerXML():
    """interface to load reference word alignments from a TigerXML corpus.
    Tested on SMULTRON (http://kitt.cl.uzh.ch/kitt/smultron/)
    """

    def __init__(self, alignment_xml):
        """only argument is the TigerXML file"""

        self.treebanks = self._open_treebanks(alignment_xml)

        self.word_pairs = defaultdict(lambda: defaultdict(int))
        self.word_source = defaultdict(int)
        self.word_target = defaultdict(int)

    def load_word_pairs(self, src, target):
        """load word pairs. src and target are the identifiers of the source and target language in the XML"""

        if not src or not target:
            sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n')
            exit(1)

        alignments = self._get_aligned_ids(src, target)
        self._textualize_alignments(src, target, alignments)

    def _open_treebanks(self, alignment_xml):
        """The parallel XML format references monolingual files. Open all of them."""

        alignment_path = os.path.dirname(alignment_xml)
        align_xml = ET.parse(alignment_xml)

        treebanks = {}
        treebanks['aligned'] = align_xml

        for treebank in align_xml.findall('//treebank'):
            treebank_id = treebank.get('id')
            filename = treebank.get('filename')
            if not os.path.isabs(filename):
                filename = os.path.join(alignment_path, filename)
            treebanks[treebank_id] = ET.parse(filename)

        return treebanks
    def _get_aligned_ids(self, src, target):
        """first step: find which nodes are aligned."""

        alignments = []
        ids = defaultdict(dict)

        for alignment in self.treebanks['aligned'].findall('//align'):
            newpair = {}

            if len(alignment) != 2:
                sys.stderr.write('Error: alignment with ' + str(len(alignment)) + ' children. Expected 2. Skipping.\n')
                continue

            for node in alignment:
                lang = node.get('treebank_id')
                node_id = node.get('node_id')
                newpair[lang] = node_id

            if not (src in newpair and target in newpair):
                sys.stderr.write('Error: source and target languages don\'t match. Skipping.\n')
                continue

            # every token may only appear in one alignment pair;
            # if it occurs in multiple, we interpret them as one 1-to-many or many-to-1 alignment
            if newpair[src] in ids[src]:
                idx = ids[src][newpair[src]]
                alignments[idx][1].append(newpair[target])
            elif newpair[target] in ids[target]:
                idx = ids[target][newpair[target]]
                alignments[idx][0].append(newpair[src])
            else:
                idx = len(alignments)
                alignments.append(([newpair[src]], [newpair[target]]))
                ids[src][newpair[src]] = idx
                ids[target][newpair[target]] = idx

        alignments = self._discard_discontinuous(alignments)

        return alignments
    def _discard_discontinuous(self, alignments):
        """discard discontinuous word sequences (which we can't use for phrase-based SMT systems)
        and make sure that each sequence is in the correct order.
        """

        new_alignments = []
        for alignment in alignments:
            new_pair = []
            for sequence in alignment:

                sequence_split = [t_id.split('_') for t_id in sequence]

                # check if all words come from the same sentence
                sentences = [item[0] for item in sequence_split]
                if not len(set(sentences)) == 1:
                    #sys.stderr.write('Warning. Word sequence crossing sentence boundary. Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue

                # sort words and check for discontinuities.
                try:
                    tokens = sorted([int(item[1]) for item in sequence_split])
                except ValueError:
                    #sys.stderr.write('Warning. Not valid word IDs. Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue

                if not tokens[-1] - tokens[0] == len(tokens) - 1:
                    #sys.stderr.write('Warning. Discontinuous word sequence(?). Discarding.\n')
                    #sys.stderr.write(str(sequence)+'\n')
                    continue

                out_sequence = [sentences[0] + '_' + str(token) for token in tokens]
                new_pair.append(out_sequence)

            if len(new_pair) == 2:
                new_alignments.append(new_pair)

        return new_alignments
    def _textualize_alignments(self, src, target, alignments):
        """Knowing which nodes are aligned, get the actual words that are aligned."""

        words = defaultdict(dict)
        for text in [text for text in self.treebanks if not text == 'aligned']:
            #TODO: Make lowercasing optional
            for terminal in self.treebanks[text].findall('//t'):
                words[text][terminal.get('id')] = terminal.get('word').lower()

        for (src_ids, target_ids) in alignments:
            try:
                src_text = ' '.join((words[src][src_id] for src_id in src_ids))
            except KeyError:
                #sys.stderr.write('Warning. ID not found: '+ str(src_ids) +'\n')
                continue
            try:
                target_text = ' '.join((words[target][target_id] for target_id in target_ids))
            except KeyError:
                #sys.stderr.write('Warning. ID not found: '+ str(target_ids) +'\n')
                continue

            self.word_pairs[src_text][target_text] += 1
            self.word_source[src_text] += 1
            self.word_target[target_text] += 1

class Moses_Alignment():
    """interface to load reference phrase alignments from a corpus aligned with GIZA++
    and with extraction heuristics as applied by the Moses toolkit.
    """

    def __init__(self, alignment_file):

        self.alignment_file = alignment_file

        self.word_pairs = defaultdict(lambda: defaultdict(int))
        self.word_source = defaultdict(int)
        self.word_target = defaultdict(int)

    def load_word_pairs(self, src_lang, target_lang):
        """main function. overwrite this to import data in a different format."""

        fileobj = handle_file(self.alignment_file, 'open', 'r')

        for line in fileobj:
            line = line.split(b' ||| ')
            if line[-1].endswith(b' |||'):
                line[-1] = line[-1][:-4]
                line.append(b'')

            src = line[0]
            target = line[1]

            self.word_pairs[src][target] += 1
            self.word_source[src] += 1
            self.word_target[target] += 1

def dot_product(a, b):
    """calculate the dot product of two lists"""

    # optimized for PyPy (much faster than enumerate/map)
    s = 0
    i = 0
    for x in a:
        s += x * b[i]
        i += 1

    return s


def priority_sort_models(models):
    """primary models should have priority over supplementary models.
    zipped with the index so we know which weight belongs to which model
    """

    return [(model, priority, i) for (i, (model, priority)) in sorted(zip(range(len(models)), models), key=lambda x: x[1][1])]

def cross_entropy(model_interface, reference_interface, weights, score, mode, flags):
    """calculate cross-entropy given all necessary information.
    don't call this directly, but use one of the Combine_TMs methods.
    """

    weights = normalize_weights(weights, mode, flags)

    if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']:
        num_results = len(model_interface.models)
    else:
        num_results = 1

    cross_entropies = [[0]*num_results for i in range(model_interface.number_of_features)]
    oov = [0]*num_results
    oov2 = 0
    other_translations = [0]*num_results
    ignored = [0]*num_results
    n = [0]*num_results
    total_pairs = 0

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:
            c = reference_interface.word_pairs[src][target]

            for i in range(num_results):

                if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:

                    if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']) or ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']):

                        if 0 in model_interface.phrase_pairs[src][target][0][0]: # only use intersection of models for comparability
                            # update unknown words statistics
                            if model_interface.phrase_pairs[src][target][0][0][i]:
                                ignored[i] += c
                            elif src in model_interface.phrase_source and model_interface.phrase_source[src][i]:
                                other_translations[i] += c
                            else:
                                oov[i] += c
                            continue

                        if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']):
                            tmp_weights = [[0]*i + [1] + [0]*(num_results-i-1)]*model_interface.number_of_features
                        elif ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']):
                            tmp_weights = weights

                        features = score(tmp_weights, src, target, model_interface, flags)

                    else:
                        features = score(weights, src, target, model_interface, flags)

                    # if the weight is so low that a feature gets probability zero
                    if 0 in features:
                        #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target))
                        #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n')
                        #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n')
                        continue

                    n[i] += c
                    for j in range(model_interface.number_of_features):
                        cross_entropies[j][i] -= log(features[j], 2)*c

                elif src in model_interface.phrase_source and not ('compare_cross-entropies' in flags and flags['compare_cross-entropies']):
                    other_translations[i] += c

                else:
                    oov2 += c

            total_pairs += c

    oov2 = int(oov2/num_results)

    for i in range(num_results):
        try:
            for j in range(model_interface.number_of_features):
                cross_entropies[j][i] /= n[i]
        except ZeroDivisionError:
            sys.stderr.write('Warning: no matching phrase pairs between reference set and model\n')
            for j in range(model_interface.number_of_features):
                cross_entropies[j][i] = 0

    if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']:
        return [tuple([ce[i] for ce in cross_entropies]) + (other_translations[i], oov[i], ignored[i], n[i], total_pairs) for i in range(num_results)], (n[0], total_pairs, oov2)
    else:
        return tuple([ce[0] for ce in cross_entropies]) + (other_translations[0], oov2, total_pairs)
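# For each feature j, the returned cross-entropy corresponds to
#   H_j = -1/N * sum over reference pairs (s,t) of c(s,t) * log2 p_j(s,t)
# where N is the total count of reference phrase pairs found in the model(s).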

def cross_entropy_light(model_interface, reference_interface, weights, score, mode, flags, cache):
    """calculate cross-entropy given all necessary information.
    don't call this directly, but use one of the Combine_TMs methods.
    Same as cross_entropy, but optimized for speed: it doesn't generate all of the statistics,
    doesn't normalize, and uses caching.
    """

    weights = normalize_weights(weights, mode, flags)
    cross_entropies = [0]*model_interface.number_of_features

    for (src, target, c) in cache:
        features = score(weights, src, target, model_interface, flags, cache=True)

        if 0 in features:
            #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target))
            #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n')
            #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n')
            continue

        for i in range(model_interface.number_of_features):
            cross_entropies[i] -= log(features[i], 2)*c

    return cross_entropies

def _get_reference_cache(reference_interface, model_interface):
    """create a data structure that allows quick access
    to all relevant reference set phrase/word pairs and their frequencies.
    """

    cache = []
    n = 0

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:
            if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:
                c = reference_interface.word_pairs[src][target]
                cache.append((src, target, c))
                n += c

    return cache, n


def _get_lexical_filter(reference_interface, model_interface):
    """return dictionaries that store the words and word pairs needed
    for perplexity optimization. We can use these dicts to load less data into memory for optimization.
    """

    e2f_filter = defaultdict(set)
    f2e_filter = defaultdict(set)

    for src in reference_interface.word_pairs:
        for target in reference_interface.word_pairs[src]:
            if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]:
                e2f_alignment, f2e_alignment = model_interface.get_word_alignments(src, target)

                for s, t_list in e2f_alignment:
                    for t in t_list:
                        e2f_filter[s].add(t)
                for t, s_list in f2e_alignment:
                    for s in s_list:
                        f2e_filter[t].add(s)

    return e2f_filter, f2e_filter

def _hillclimb_move(weights, stepsize, mode, flags):
    """Move function for the hillclimb algorithm. Updates each weight by stepsize."""

    for i, w in enumerate(weights):
        yield normalize_weights(weights[:i] + [w+stepsize] + weights[i+1:], mode, flags)

    for i, w in enumerate(weights):
        new = w - stepsize
        if new >= 1e-10:
            yield normalize_weights(weights[:i] + [new] + weights[i+1:], mode, flags)

def _hillclimb(scores, best_weights, objective, model_interface, reference_interface, score_function, mode, flags, precision, cache, n):
    """first (deprecated) implementation of iterative weight optimization."""

    best = objective(best_weights)

    i = 0 # counts the number of iterations with the same stepsize: if greater than 10, the stepsize is doubled
    stepsize = 512 # initial stepsize
    move = 1 # whether we found a better set of weights in the current iteration. if not, the stepsize is halved

    sys.stderr.write('Hillclimb: step size: ' + str(stepsize))
    while stepsize > 0.0078:
        if not move:
            stepsize /= 2
            sys.stderr.write(' ' + str(stepsize))
            i = 0
            move = 1
            continue

        move = 0

        for w in _hillclimb_move(list(best_weights), stepsize, mode, flags):
            weights_tuple = tuple(w)

            if weights_tuple in scores:
                continue

            scores[weights_tuple] = cross_entropy_light(model_interface, reference_interface, [w for m in range(model_interface.number_of_features)], score_function, mode, flags, cache)

            if objective(weights_tuple) + precision < best:
                best = objective(weights_tuple)
                best_weights = weights_tuple
                move = 1

        if i and not i % 10:
            sys.stderr.write('\nIteration ' + str(i) + ' with stepsize ' + str(stepsize) + '. current cross-entropy: ' + str(best) + ' - weights: ' + str(best_weights) + ' ')
            stepsize *= 2
            sys.stderr.write('\nIncreasing stepsize: ' + str(stepsize))
            i = 0

        i += 1

    return best_weights

def optimize_cross_entropy_hillclimb(model_interface, reference_interface, initial_weights, score_function, mode, flags, precision=0.000001):
    """find weights that minimize cross-entropy on a tuning set.
    deprecated (default is now L-BFGS (optimize_cross_entropy)), but left in for people without SciPy
    """

    scores = {}
    best_weights = tuple(initial_weights[0])

    cache, n = _get_reference_cache(reference_interface, model_interface)

    # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed
    objectives = [(lambda x: scores[x][i]/n, [i], 'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)]

    scores[best_weights] = cross_entropy_light(model_interface, reference_interface, initial_weights, score_function, mode, flags, cache)

    final_weights = initial_weights[:]
    final_cross_entropy = [0]*model_interface.number_of_features

    for i, (objective, features, comment) in enumerate(objectives):

        best_weights = min(scores, key=objective)
        sys.stderr.write('Optimizing objective "' + comment + '"\n')
        best_weights = _hillclimb(scores, best_weights, objective, model_interface, reference_interface, score_function, feature_specific_mode(mode, i, flags), flags, precision, cache, n)

        sys.stderr.write('\nCross-entropy: ' + str(objective(best_weights)) + ' - weights: ' + str(best_weights) + '\n\n')

        for j in features:
            final_weights[j] = list(best_weights)
            final_cross_entropy[j] = objective(best_weights)

    return final_weights, final_cross_entropy

def optimize_cross_entropy(model_interface, reference_interface, initial_weights, score_function, mode, flags):
    """find weights that minimize cross-entropy on a tuning set.
    Uses L-BFGS optimization and requires SciPy
    """

    if not optimizer == 'l-bfgs':
        sys.stderr.write('SciPy is not installed. Falling back to naive hillclimb optimization (instead of L-BFGS)\n')
        return optimize_cross_entropy_hillclimb(model_interface, reference_interface, initial_weights, score_function, mode, flags)

    cache, n = _get_reference_cache(reference_interface, model_interface)

    # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed
    objectives = [(lambda w: cross_entropy_light(model_interface, reference_interface, [[1]+list(w) for m in range(model_interface.number_of_features)], score_function, feature_specific_mode(mode, i, flags), flags, cache)[i], [i], 'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)] # optimize cross-entropy for p(s|t)

    final_weights = initial_weights[:]
    final_cross_entropy = [0]*model_interface.number_of_features

    for i, (objective, features, comment) in enumerate(objectives):

        sys.stderr.write('Optimizing objective "' + comment + '"\n')
        initial_values = [1]*(len(model_interface.models)-1) # we leave the value of the first model at 1 and optimize all others (normalized of course)
        best_weights, best_point, data = fmin_l_bfgs_b(objective, initial_values, approx_grad=True, bounds=[(0.000000001, None)]*len(initial_values))
        best_weights = normalize_weights([1] + list(best_weights), feature_specific_mode(mode, i, flags), flags)

        sys.stderr.write('Cross-entropy after L-BFGS optimization: ' + str(best_point/n) + ' - weights: ' + str(best_weights) + '\n')

        for j in features:
            final_weights[j] = list(best_weights)
            final_cross_entropy[j] = best_point/n

    return final_weights, final_cross_entropy

def feature_specific_mode(mode, i, flags):
    """in mode 'counts', only the default Moses features can be recomputed from raw frequencies;
    all other features are interpolated by default.
    This function mostly serves cosmetic purposes (i.e. normalizing a single weight vector for logging),
    since normalize_weights also handles a mix of interpolated and recomputed features.
    """

    if mode == 'counts' and i not in [flags['i_e2f'], flags['i_e2f_lex'], flags['i_f2e'], flags['i_f2e_lex']]:
        return 'interpolate'
    else:
        return mode

def redistribute_probability_mass(weights, src, target, interface, flags, mode='interpolate'):
    """the conditional probability p(x|y) is undefined for cases where p(y) = 0;
    this function redistributes the probability mass to only consider models for which p(y) > 0
    """

    i_e2f = flags['i_e2f']
    i_e2f_lex = flags['i_e2f_lex']
    i_f2e = flags['i_f2e']
    i_f2e_lex = flags['i_f2e_lex']

    new_weights = weights[:]

    if flags['normalize_s_given_t'] == 's':

        # set weight to 0 for all models where the source phrase is unseen (p(s|t))
        new_weights[i_e2f] = list(map(mul, interface.phrase_source[src], weights[i_e2f]))
        if flags['normalize-lexical_weights']:
            new_weights[i_e2f_lex] = list(map(mul, interface.phrase_source[src], weights[i_e2f_lex]))

    elif flags['normalize_s_given_t'] == 't':

        # set weight to 0 for all models where the target phrase is unseen (p(s|t))
        new_weights[i_e2f] = list(map(mul, interface.phrase_target[target], weights[i_e2f]))
        if flags['normalize-lexical_weights']:
            new_weights[i_e2f_lex] = list(map(mul, interface.phrase_target[target], weights[i_e2f_lex]))

    # set weight to 0 for all models where the source phrase is unseen (p(t|s))
    new_weights[i_f2e] = list(map(mul, interface.phrase_source[src], weights[i_f2e]))
    if flags['normalize-lexical_weights']:
        new_weights[i_f2e_lex] = list(map(mul, interface.phrase_source[src], weights[i_f2e_lex]))

    return normalize_weights(new_weights, mode, flags)

def score_interpolate(weights, src, target, interface, flags, cache=False):
    """linear interpolation of probabilities (and other feature values).
    if normalized is True, the probability mass for p(x|y) is redistributed to models with p(y) > 0
    """

    model_values = interface.phrase_pairs[src][target][0]

    scores = [0]*len(model_values)

    if 'normalized' in flags and flags['normalized']:
        normalized_weights = redistribute_probability_mass(weights, src, target, interface, flags)
    else:
        normalized_weights = weights

    if 'recompute_lexweights' in flags and flags['recompute_lexweights']:
        e2f_alignment, f2e_alignment = interface.get_word_alignments(src, target, cache=cache)

        if not e2f_alignment or not f2e_alignment:
            sys.stderr.write('Error: no word alignments found, but they are necessary for lexical weight computation.\n')
            lst = 0
            lts = 0
        else:
            scores[flags['i_e2f_lex']] = compute_lexicalweight(normalized_weights[flags['i_e2f_lex']], e2f_alignment, interface.word_pairs_e2f, None, mode='interpolate')
            scores[flags['i_f2e_lex']] = compute_lexicalweight(normalized_weights[flags['i_f2e_lex']], f2e_alignment, interface.word_pairs_f2e, None, mode='interpolate')

    for idx, prob in enumerate(model_values):
        if not ('recompute_lexweights' in flags and flags['recompute_lexweights'] and (idx == flags['i_e2f_lex'] or idx == flags['i_f2e_lex'])):
            scores[idx] = dot_product(prob, normalized_weights[idx])

    return scores
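# In other words, each feature is scored as p(x) = sum over models i of w_i * p_i(x),
# with the w_i optionally renormalized over the models that actually know the conditioning phrase.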

def score_loglinear(weights, src, target, interface, flags, cache=False):
    """loglinear interpolation of probabilities.
    warning: if a phrase pair does not occur in all models, the resulting probability is 0.
    this is usually not what you want - loglinear scoring is only included for completeness' sake
    """

    scores = []
    model_values = interface.phrase_pairs[src][target][0]

    for idx, prob in enumerate(model_values):
        try:
            scores.append(exp(dot_product(list(map(log, prob)), weights[idx])))
        except ValueError:
            scores.append(0)

    return scores

def score_counts(weights, src, target, interface, flags, cache=False):
    """count-based re-estimation of probabilities and lexical weights.
    each count is multiplied by its weight; the trivial case is weight 1 for each model, which corresponds to a concatenation
    """

    i_e2f = flags['i_e2f']
    i_e2f_lex = flags['i_e2f_lex']
    i_f2e = flags['i_f2e']
    i_f2e_lex = flags['i_f2e_lex']

    # if we have a non-default number of weights, assume that we might have to do a mix of count-based and interpolated scores.
    if len(weights) == 4:
        scores = [0]*len(weights)
    else:
        scores = score_interpolate(weights, src, target, interface, flags, cache=cache)

    try:
        joined_count = dot_product(interface.phrase_pairs[src][target][0][i_e2f], weights[i_e2f])
        target_count = dot_product(interface.phrase_target[target], weights[i_e2f])
        scores[i_e2f] = joined_count / target_count
    except ZeroDivisionError:
        scores[i_e2f] = 0

    try:
        joined_count = dot_product(interface.phrase_pairs[src][target][0][i_f2e], weights[i_f2e])
        source_count = dot_product(interface.phrase_source[src], weights[i_f2e])
        scores[i_f2e] = joined_count / source_count
    except ZeroDivisionError:
        scores[i_f2e] = 0

    e2f_alignment, f2e_alignment = interface.get_word_alignments(src, target, cache=cache)

    if not e2f_alignment or not f2e_alignment:
        sys.stderr.write('Error: no word alignments found, but they are necessary for lexical weight computation.\n')
        scores[i_e2f_lex] = 0
        scores[i_f2e_lex] = 0
    else:
        scores[i_e2f_lex] = compute_lexicalweight(weights[i_e2f_lex], e2f_alignment, interface.word_pairs_e2f, interface.word_target, mode='counts', cache=cache)
        scores[i_f2e_lex] = compute_lexicalweight(weights[i_f2e_lex], f2e_alignment, interface.word_pairs_f2e, interface.word_source, mode='counts', cache=cache)

    return scores
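# The count-based features thus amount to
#   p(s|t) = sum_i w_i*c_i(s,t) / sum_i w_i*c_i(t)   and   p(t|s) = sum_i w_i*c_i(s,t) / sum_i w_i*c_i(s),
# with the lexical weights recomputed from the weighted lexical counts.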

def score_interpolate_reordering(weights, src, target, interface):
    """linear interpolation of reordering model probabilities.
    also normalizes the model so that each half of the feature vector sums to one (bidirectional Moses configuration).
    """

    model_values = interface.reordering_pairs[src][target]

    scores = [0]*len(model_values)
    for idx, prob in enumerate(model_values):
        scores[idx] = dot_product(prob, weights[idx])

    # normalizes first half and last half probabilities (so that each half sums to one).
    # only makes sense for a bidirectional configuration in Moses. Remove/change this if you want a different (or no) normalization
    scores = normalize_weights(scores[:int(interface.number_of_features/2)], 'interpolate') + normalize_weights(scores[int(interface.number_of_features/2):], 'interpolate')

    return scores

def compute_lexicalweight(weights, alignment, word_pairs, marginal, mode='counts', cache=False, mycache=[0, defaultdict(dict)]):
    """compute the lexical weights as implemented in the Moses toolkit"""

    lex = 1

    # new weights: empty cache
    if cache and mycache[0] != weights:
        mycache[0] = weights
        mycache[1] = defaultdict(dict)

    for x, translations in alignment:

        # skip nonterminals
        if x.startswith(b'['):
            continue

        if cache and translations in mycache[1][x]:
            lex_step = mycache[1][x][translations]
        else:
            lex_step = 0
            for y in translations:
                if mode == 'counts':
                    lex_step += dot_product(word_pairs[x][y], weights) / dot_product(marginal[y], weights)
                elif mode == 'interpolate':
                    lex_step += dot_product(word_pairs[x][y], weights)
            lex_step /= len(translations)
            if cache:
                mycache[1][x][translations] = lex_step

        lex *= lex_step

    return lex
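# This mirrors the Moses definition: for each word x with aligned translations T(x),
#   lex *= 1/|T(x)| * sum over y in T(x) of w(x|y)
# where w(x|y) is either an interpolated probability or a ratio of weighted counts, depending on `mode`.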

def normalize_weights(weights, mode, flags=None):
    """make sure that the probability mass in linear interpolation is 1;
    for weighted counts, the weight of the first model is set to 1
    """

    if mode == 'interpolate' or mode == 'loglinear':

        if type(weights[0]) == list:
            new_weights = []

            for weight_list in weights:
                total = sum(weight_list)

                try:
                    weight_list = [weight/total for weight in weight_list]
                except ZeroDivisionError:
                    sys.stderr.write('Error: Zero division in weight normalization. Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in a model with weight 0\n')

                new_weights.append(weight_list)

        else:
            total = sum(weights)

            try:
                new_weights = [weight/total for weight in weights]
            except ZeroDivisionError:
                sys.stderr.write('Error: Zero division in weight normalization. Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in a model with weight 0\n')

    elif mode == 'counts_pure':

        if type(weights[0]) == list:
            new_weights = []

            for weight_list in weights:
                ratio = 1/weight_list[0]
                new_weights.append([weight * ratio for weight in weight_list])

        else:
            ratio = 1/weights[0]
            new_weights = [weight * ratio for weight in weights]

    # make sure that features other than the standard Moses features are always interpolated (since no count-based computation is defined)
    elif mode == 'counts':
        if type(weights[0]) == list:
            norm_counts = normalize_weights(weights, 'counts_pure')
            new_weights = normalize_weights(weights, 'interpolate')
            for i in [flags['i_e2f'], flags['i_e2f_lex'], flags['i_f2e'], flags['i_f2e_lex']]:
                new_weights[i] = norm_counts[i]
            return new_weights
        else:
            return normalize_weights(weights, 'counts_pure')

    return new_weights
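# Illustrative examples: normalize_weights([3, 1], 'interpolate') -> [0.75, 0.25];
# normalize_weights([0.5, 1], 'counts_pure') -> [1.0, 2.0].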

def handle_file(filename, action, fileobj=None, mode='r'):
    """support reading/writing either from/to file, stdout or gzipped file"""

    if action == 'open':

        if mode == 'r':
            mode = 'rb'
        elif mode == 'w':
            mode = 'wb'

        if mode == 'rb' and not filename == '-' and not os.path.exists(filename):
            if os.path.exists(filename + '.gz'):
                filename = filename + '.gz'
            else:
                sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n')

                if 'counts' in filename and os.path.exists(os.path.dirname(filename)):
                    sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n')
                    sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n')

                exit(1)

        if filename.endswith('.gz'):
            fileobj = gzip.open(filename, mode)

        elif filename == '-' and mode == 'wb':
            fileobj = sys.stdout

        else:
            fileobj = open(filename, mode)

        return fileobj

    elif action == 'close' and filename != '-':
        fileobj.close()
def sort_file ( filename , tempdir = None ) :
2012-01-31 13:50:20 +04:00
""" Sort a file and return temporary file """
cmd = [ ' sort ' , filename ]
env = { }
env [ ' LC_ALL ' ] = ' C '
2012-02-28 13:05:30 +04:00
if tempdir :
cmd . extend ( [ ' -T ' , tempdir ] )
2012-01-31 13:50:20 +04:00
2012-02-27 17:11:47 +04:00
outfile = NamedTemporaryFile ( delete = False , dir = tempdir )
2012-01-31 13:50:20 +04:00
sys . stderr . write ( ' LC_ALL=C ' + ' ' . join ( cmd ) + ' > ' + outfile . name + ' \n ' )
p = Popen ( cmd , env = env , stdout = outfile . file )
p . wait ( )
outfile . seek ( 0 )
return outfile

class Combine_TMs():
    """This class handles the various options, checks them for sanity and has methods that define what models to load and what functions to call for the different tasks.
    Typically, you only need to interact with this class and its attributes.
    """

    # some flags that change the behaviour during scoring. See the init docstring for more info
    flags = {'normalized': False,
             'recompute_lexweights': False,
             'intersected_cross-entropies': False,
             'normalize_s_given_t': None,
             'normalize-lexical_weights': True,
             'add_origin_features': False,
             'write_phrase_penalty': False,
             'lowmem': False,
             'i_e2f': 0,
             'i_e2f_lex': 1,
             'i_f2e': 2,
             'i_f2e_lex': 3
             }

    # each model needs a priority. See the init docstring for more info
    _priorities = {'primary': 1,
                   'map': 2,
                   'supplementary': 10}
2012-05-31 16:23:38 +04:00
def __init__ ( self , models , weights = None ,
output_file = None ,
mode = ' interpolate ' ,
number_of_features = 4 ,
model_interface = Moses ,
reference_interface = Moses_Alignment ,
reference_file = None ,
lang_src = None ,
lang_target = None ,
output_lexical = None ,
* * flags ) :
        """The whole configuration of the task is done during initialization. Afterwards, you only need to call your intended method(s).
           You can change some of the class attributes afterwards (such as the weights, or the output file), but you should never change the models or mode after initialization.
           See the test() function for example configurations.

           models: list of tuples (path, priority) that defines which models to process. Path is usually the top directory of a Moses model. There are three priorities:
               'primary': phrase pairs with this priority will always be included in the output model. For most purposes, you'll want to define all models as primary.
               'map': for maximum a-posteriori combination (Bacchiani et al. 2004; Foster et al. 2010). For use with mode 'counts'. Stores c(t)=1 and c(s,t)=p(s|t).
               'supplementary': phrase pairs are considered for probability computation, but not included in the output model (unless they also occur in at least one primary model).
                   Useful for rescoring a model without changing its vocabulary.
           weights: accepts two types of weight declarations: one weight per model, and one weight per model and feature.
               Type one is internally converted to type two. For 2 models with four features, this looks like: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]]
               default: uniform weights (None)
           output_file: file path of the output phrase table. If it ends with .gz, the file is automatically zipped.
           output_lexical: if defined, also writes combined lexical tables. Writes to output_lexical.e2f and output_lexical.f2e, or output_lexical.counts.e2f in mode 'counts'.
           mode: declares the basic mixture-model algorithm. There are currently three options:
               'counts': weighted counts (requires some statistics that Moses doesn't produce. Repeat step 4 of Moses training with the option -write-lexical-counts to obtain them.)
                   Only the standard Moses features are recomputed from weighted counts; additional features are linearly interpolated
                   (see number_of_features to allow more features, and i_e2f etc. if the standard features are in a non-standard position).
               'interpolate': linear interpolation
               'loglinear': loglinear interpolation (careful: this creates the intersection of phrase tables and is often of little use)
           number_of_features: can be used to interpolate models with non-default Moses features. 4 features are currently still hardcoded in various places
               (e.g. cross_entropy calculations, mode 'counts').
           i_e2f, i_e2f_lex, i_f2e, i_f2e_lex: index of the (Moses) phrase table features p(s|t), lex(s|t), p(t|s) and lex(t|s).
               Relevant for mode 'counts', and if 'recompute_lexweights' is True in mode 'interpolate'. In mode 'counts', any additional features are combined through linear interpolation.
           model_interface: class that handles reading phrase tables and lexical tables, and writing phrase tables. Currently, only Moses is implemented.
               default: Moses
           reference_interface: class that deals with reading in reference phrase pairs for cross-entropy computation.
               Moses_Alignment: word/phrase pairs as computed by GIZA++ and extracted through Moses heuristics. This corresponds to the file model/extract.gz if you train a Moses model on your tuning set.
               TigerXML: TigerXML data format
               default: Moses_Alignment
           reference_file: path to reference file. Required for every operation except combination of models with given weights.
           lang_src: source language. Only required if reference_interface is TigerXML. Identifies which language in the XML file we should treat as the source language.
           lang_target: target language. Only required if reference_interface is TigerXML. Identifies which language in the XML file we should treat as the target language.
           intersected_cross-entropies: compute cross-entropies of the intersection of phrase pairs, ignoring phrase pairs that do not occur in all models.
               If False, the algorithm operates on the union of phrase pairs.
               default: False
           add_origin_features: for each model that is being combined, add a binary feature to the final phrase table, with values of 1 (phrase pair doesn't occur in model) and 2.718 (it does).
               This indicates which model(s) a phrase pair comes from and can be used during MERT to additionally reward/penalize translation models.
           lowmem: low memory mode: instead of loading target phrase counts/probabilities (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
           tempdir: temporary directory (for low memory mode).

           There are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'.

           recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
               default: False
           normalized: for interpolation of p(x|y): if True, models with p(y)=0 will be ignored, and probability mass will be distributed among models with p(y)>0.
               If False, missing entries (x,y) are always interpreted as p(x|y)=0.
               default: False
           normalize_s_given_t: how do we normalize p(s|t) if 'normalized' is True? Three options:
               None: don't normalize p(s|t) and lex(s|t) (only p(t|s) and lex(t|s))
               t: check if p(t)==0: advantage: theoretically sound; disadvantage: slower (we need to know if t occurs in the model); favours rare target phrases (relative to default choice)
               s: check if p(s)==0: advantage: relevant for task; disadvantage: no true probability distributions
               default: None
           normalize-lexical_weights: also normalize lex(s|t) and lex(t|s) if 'normalized' is True.
               Reason why you might want to disable this: lexical weights suffer less from data sparseness than probabilities.
               default: True
        """
        self.mode = mode
        self.output_file = output_file
        self.lang_src = lang_src
        self.lang_target = lang_target
        self.loaded = defaultdict(int)
        self.output_lexical = output_lexical

        self.flags = copy.copy(self.flags)
        self.flags.update(flags)

        self.flags['i_e2f'] = int(self.flags['i_e2f'])
        self.flags['i_e2f_lex'] = int(self.flags['i_e2f_lex'])
        self.flags['i_f2e'] = int(self.flags['i_f2e'])
        self.flags['i_f2e_lex'] = int(self.flags['i_f2e_lex'])

        if reference_interface:
            self.reference_interface = reference_interface(reference_file)

        if mode not in ['interpolate', 'loglinear', 'counts']:
            sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n')
            sys.exit(1)

        models, number_of_features, weights = self._sanity_checks(models, number_of_features, weights)

        self.weights = weights
        self.models = models
        self.model_interface = model_interface(models, number_of_features)

        if mode == 'interpolate':
            self.score = score_interpolate
        elif mode == 'loglinear':
            self.score = score_loglinear
        elif mode == 'counts':
            self.score = score_counts

    def _sanity_checks(self, models, number_of_features, weights):
        """check if input arguments make sense (correct number of weights, valid model priorities etc.)
           This is only called on initialization. If you change weights afterwards, better know what you're doing.
        """

        number_of_features = int(number_of_features)

        for (model, priority) in models:
            assert(priority in self._priorities)
        models = [(model, self._priorities[p]) for (model, p) in models]

        # accept two types of weight declarations: one weight per model, and one weight per model and feature
        # type one is internally converted to type two: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]]
        if weights:
            if type(weights[0]) == list:
                assert(len(weights) == number_of_features)
                for sublist in weights:
                    assert(len(sublist) == len(models))
            else:
                assert(len(models) == len(weights))
                weights = [weights for i in range(number_of_features)]
        else:
            if self.mode == 'loglinear' or self.mode == 'interpolate':
                weights = [[1 / len(models)] * len(models) for i in range(number_of_features)]
            elif self.mode == 'counts':
                weights = [[1] * len(models) for i in range(number_of_features)]
            sys.stderr.write('Warning: No weights defined: initializing with uniform weights\n')

        new_weights = normalize_weights(weights, self.mode, self.flags)
        if weights != new_weights:
            if self.mode == 'interpolate' or self.mode == 'loglinear':
                sys.stderr.write('Warning: weights should sum to 1 - ')
            elif self.mode == 'counts':
                sys.stderr.write('Warning: normalizing weights so that first model has weight 1 (for features that are recomputed from counts) - ')
            sys.stderr.write('normalizing to: ' + str(new_weights) + '\n')
            weights = new_weights

        return models, number_of_features, weights
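
    # Illustrative sketch of the normalization above (behaviour inferred from the warning
    # messages; the exact values depend on normalize_weights(), defined elsewhere in this
    # file). With two models and identical per-feature vectors:
    #   mode 'interpolate'/'loglinear': [[0.3, 0.3], ...] -> [[0.5, 0.5], ...]  (each vector sums to 1)
    #   mode 'counts':                  [[0.5, 1.0], ...] -> [[1.0, 2.0], ...]  (first model gets weight 1)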

    def _ensure_loaded(self, data):
        """load data (lexical tables; reference alignment; phrase table), if it isn't already in memory"""

        if 'lexical' in data:
            self.model_interface.require_alignment = True

        if 'reference' in data and not self.loaded['reference']:
            sys.stderr.write('Loading word pairs from reference set...')
            self.reference_interface.load_word_pairs(self.lang_src, self.lang_target)
            sys.stderr.write('done\n')
            self.loaded['reference'] = 1

        if 'lexical' in data and not self.loaded['lexical']:
            sys.stderr.write('Loading lexical tables...')
            self.model_interface.load_lexical_tables(self.models, self.mode)
            sys.stderr.write('done\n')
            self.loaded['lexical'] = 1

        if 'pt-filtered' in data and not self.loaded['pt-filtered']:
            models_prioritized = [(self.model_interface.open_table(model, 'phrase-table'), priority, i) for (model, priority, i) in priority_sort_models(self.models)]

            for model, priority, i in models_prioritized:
                sys.stderr.write('Loading phrase table ' + str(i) + ' (only data relevant for reference set)')
                j = 0
                for line in model:
                    if not j % 1000000:
                        sys.stderr.write('...' + str(j))
                    j += 1
                    line = line.rstrip().split(b' ||| ')
                    if line[-1].endswith(b' |||'):
                        line[-1] = line[-1][:-4]
                        line.append(b'')
                    self.model_interface.load_phrase_features(line, priority, i, store='all', mode=self.mode, filter_by=self.reference_interface.word_pairs, filter_by_src=self.reference_interface.word_source, filter_by_target=self.reference_interface.word_target, flags=self.flags)
                sys.stderr.write(' done\n')

            self.loaded['pt-filtered'] = 1

        if 'lexical-filtered' in data and not self.loaded['lexical-filtered']:
            e2f_filter, f2e_filter = _get_lexical_filter(self.reference_interface, self.model_interface)

            sys.stderr.write('Loading lexical tables (only data relevant for reference set)...')
            self.model_interface.load_lexical_tables(self.models, self.mode, e2f_filter=e2f_filter, f2e_filter=f2e_filter)
            sys.stderr.write('done\n')
            self.loaded['lexical-filtered'] = 1

        if 'pt-target' in data and not self.loaded['pt-target']:
            models_prioritized = [(self.model_interface.open_table(model, 'phrase-table'), priority, i) for (model, priority, i) in priority_sort_models(self.models)]

            for model, priority, i in models_prioritized:
                sys.stderr.write('Loading target information from phrase table ' + str(i))
                j = 0
                for line in model:
                    if not j % 1000000:
                        sys.stderr.write('...' + str(j))
                    j += 1
                    line = line.rstrip().split(b' ||| ')
                    if line[-1].endswith(b' |||'):
                        line[-1] = line[-1][:-4]
                        line.append(b'')
                    self.model_interface.load_phrase_features(line, priority, i, mode=self.mode, store='target', flags=self.flags)
                sys.stderr.write(' done\n')

            self.loaded['pt-target'] = 1

    def _inverse_wrapper(self, weights, tempdir=None):
        """if we want to invert the phrase table to better calculate p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""

        sys.stderr.write('Processing first table half\n')
        models = [(self.model_interface.open_table(model, 'phrase-table'), priority, i) for (model, priority, i) in priority_sort_models(self.model_interface.models)]
        pt_half1 = NamedTemporaryFile(prefix='half1', delete=False, dir=tempdir)
        self._write_phrasetable(models, pt_half1, weights)
        pt_half1.seek(0)

        sys.stderr.write('Inverting tables\n')
        models = [(self.model_interface.create_inverse(self.model_interface.open_table(model, 'phrase-table'), tempdir=tempdir), priority, i) for (model, priority, i) in priority_sort_models(self.model_interface.models)]

        sys.stderr.write('Processing second table half\n')
        pt_half2_inverted = NamedTemporaryFile(prefix='half2', delete=False, dir=tempdir)
        self._write_phrasetable(models, pt_half2_inverted, weights, inverted=True)
        pt_half2_inverted.close()
        for model, priority, i in models:
            model.close()
            os.remove(model.name)
        pt_half2 = sort_file(pt_half2_inverted.name, tempdir=tempdir)
        os.remove(pt_half2_inverted.name)

        sys.stderr.write('Merging tables: first half: {0}; second half: {1}; final table: {2}\n'.format(pt_half1.name, pt_half2.name, self.output_file))
        output_object = handle_file(self.output_file, 'open', mode='w')
        self.model_interface.merge(pt_half1, pt_half2, output_object, self.mode)
        os.remove(pt_half1.name)
        os.remove(pt_half2.name)

        handle_file(self.output_file, 'close', output_object, mode='w')

    def _write_phrasetable(self, models, output_object, weights, inverted=False):
        """Incrementally load phrase tables, calculate score for increment and write it to output_object"""

        # define which information we need to store from the phrase table
        # possible flags: 'all', 'target', 'source' and 'pairs'
        # interpolated models without re-normalization only need 'pairs', otherwise 'all' is the correct choice
        store_flag = 'all'
        if self.mode == 'interpolate' and not self.flags['normalized']:
            store_flag = 'pairs'

        i = 0
        sys.stderr.write('Incrementally loading and processing phrase tables...')

        for block in self.model_interface.traverse_incrementally('phrase-table', models, self.model_interface.load_phrase_features, store_flag, mode=self.mode, inverted=inverted, lowmem=self.flags['lowmem'], flags=self.flags):
            for src in sorted(self.model_interface.phrase_pairs, key=lambda x: x + b' |'):
                for target in sorted(self.model_interface.phrase_pairs[src], key=lambda x: x + b' |'):
                    if not i % 1000000:
                        sys.stderr.write(str(i) + '...')
                    i += 1

                    features = self.score(weights, src, target, self.model_interface, self.flags)
                    outline = self.model_interface.write_phrase_table(src, target, weights, features, self.mode, self.flags)
                    output_object.write(outline)
        sys.stderr.write('done\n')

    def combine_given_weights(self, weights=None):
        """write a new phrase table, based on existing weights"""

        if not weights:
            weights = self.weights

        data = []

        if self.mode == 'counts':
            data.append('lexical')
            if not self.flags['lowmem']:
                data.append('pt-target')

        elif self.mode == 'interpolate':
            if self.flags['recompute_lexweights']:
                data.append('lexical')
            if self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't' and not self.flags['lowmem']:
                data.append('pt-target')

        self._ensure_loaded(data)

        if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
            self._inverse_wrapper(weights, tempdir=self.flags['tempdir'])
        else:
            models = [(self.model_interface.open_table(model, 'phrase-table'), priority, i) for (model, priority, i) in priority_sort_models(self.model_interface.models)]
            output_object = handle_file(self.output_file, 'open', mode='w')
            self._write_phrasetable(models, output_object, weights)
            handle_file(self.output_file, 'close', output_object, mode='w')

        if self.output_lexical:
            sys.stderr.write('Writing lexical tables\n')
            self._ensure_loaded(['lexical'])
            self.model_interface.write_lexical_file('e2f', self.output_lexical, weights[1], self.mode)
            self.model_interface.write_lexical_file('f2e', self.output_lexical, weights[3], self.mode)

    def combine_given_tuning_set(self):
        """write a new phrase table, using the weights that minimize cross-entropy on a tuning set"""

        data = ['reference', 'pt-filtered']
        if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']):
            data.append('lexical-filtered')
        self._ensure_loaded(data)

        best_weights, best_cross_entropy = optimize_cross_entropy(self.model_interface, self.reference_interface, self.weights, self.score, self.mode, self.flags)
        sys.stderr.write('Best weights: ' + str(best_weights) + '\n')
        sys.stderr.write('Cross entropies: ' + str(best_cross_entropy) + '\n')
        sys.stderr.write('Executing action combine_given_weights with -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights])))

        self.loaded['pt-filtered'] = False  # phrase table will be overwritten
        self.combine_given_weights(weights=best_weights)

    def combine_reordering_tables(self, weights=None):
        """write a new reordering table, based on existing weights."""

        if not weights:
            weights = self.weights

        data = []

        if self.mode != 'interpolate':
            sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')

        output_object = handle_file(self.output_file, 'open', mode='w')
        models = [(self.model_interface.open_table(model, 'reordering-table'), priority, i) for (model, priority, i) in priority_sort_models(self.models)]

        i = 0
        sys.stderr.write('Incrementally loading and processing phrase tables...')

        for block in self.model_interface.traverse_incrementally('reordering-table', models, self.model_interface.load_reordering_probabilities, 'pairs', mode=self.mode, lowmem=self.flags['lowmem'], flags=self.flags):
            for src in sorted(self.model_interface.reordering_pairs):
                for target in sorted(self.model_interface.reordering_pairs[src]):
                    if not i % 1000000:
                        sys.stderr.write(str(i) + '...')
                    i += 1
                    features = score_interpolate_reordering(weights, src, target, self.model_interface)
                    outline = self.model_interface.write_reordering_table(src, target, features)
                    output_object.write(outline)
        sys.stderr.write('done\n')

        handle_file(self.output_file, 'close', output_object, mode='w')
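
    # Illustrative command (sketch; model directories and output path are hypothetical):
    # reordering tables can only be combined by linear interpolation, e.g.
    #   python tmcombine.py combine_reordering_tables test/model1 test/model2 -w 0.5,0.5 -o reordering-table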

    def compare_cross_entropies(self):
        """print cross-entropies for each model/feature, using the intersection of phrase pairs.
           analysis tool.
        """

        self.flags['compare_cross-entropies'] = True

        data = ['reference', 'pt-filtered']
        if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']):
            data.append('lexical-filtered')
        self._ensure_loaded(data)

        results, (intersection, total_pairs, oov2) = cross_entropy(self.model_interface, self.reference_interface, self.weights, self.score, self.mode, self.flags)

        padding = 90
        num_features = self.model_interface.number_of_features

        print('\nResults of model comparison\n')
        print('{0:<{padding}}: {1}'.format('phrase pairs in reference (tokens)', total_pairs, padding=padding))
        print('{0:<{padding}}: {1}'.format('phrase pairs in model intersection (tokens)', intersection, padding=padding))
        print('{0:<{padding}}: {1}\n'.format('phrase pairs in model union (tokens)', total_pairs - oov2, padding=padding))

        for i, data in enumerate(results):
            cross_entropies = data[:num_features]
            (other_translations, oov, ignored, n, total_pairs) = data[num_features:]

            print('model ' + str(i))
            for j in range(num_features):
                print('{0:<{padding}}: {1}'.format('cross-entropy for feature {0}'.format(j), cross_entropies[j], padding=padding))
            print('{0:<{padding}}: {1}'.format('phrase pairs in model (tokens)', n + ignored, padding=padding))
            print('{0:<{padding}}: {1}'.format('phrase pairs in model, but not in intersection (tokens)', ignored, padding=padding))
            print('{0:<{padding}}: {1}'.format('phrase pairs in union, but not in model (but source phrase is) (tokens)', other_translations, padding=padding))
            print('{0:<{padding}}: {1}\n'.format('phrase pairs in union, but source phrase not in model (tokens)', oov, padding=padding))

        self.flags['compare_cross-entropies'] = False

        return results, (intersection, total_pairs, oov2)

    def compute_cross_entropy(self):
        """return cross-entropy for a tuning set, a set of models and a set of weights.
           analysis tool.
        """

        data = ['reference', 'pt-filtered']
        if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']):
            data.append('lexical-filtered')
        self._ensure_loaded(data)

        current_cross_entropy = cross_entropy(self.model_interface, self.reference_interface, self.weights, self.score, self.mode, self.flags)
        sys.stderr.write('Cross entropy: ' + str(current_cross_entropy) + '\n')
        return current_cross_entropy

    def return_best_cross_entropy(self):
        """return the set of weights and cross-entropy that is optimal for a tuning set and a set of models."""

        data = ['reference', 'pt-filtered']
        if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']):
            data.append('lexical-filtered')
        self._ensure_loaded(data)

        best_weights, best_cross_entropy = optimize_cross_entropy(self.model_interface, self.reference_interface, self.weights, self.score, self.mode, self.flags)
        sys.stderr.write('Best weights: ' + str(best_weights) + '\n')
        sys.stderr.write('Cross entropies: ' + str(best_cross_entropy) + '\n')
        sys.stderr.write('You can apply these weights with the action combine_given_weights and the option -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights])))
        return best_weights, best_cross_entropy


def test():
    """test (and illustrate) the functionality of the program based on two test phrase tables and a small reference set."""

    # linear interpolation of two models, with fixed weights. Output uses the vocabulary of model1 (since model2 is supplementary)
    # command line: (currently not possible to define supplementary models through the command line)
    sys.stderr.write('Regression test 1\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'supplementary']], [0.5, 0.5], os.path.join('test', 'phrase-table_test1'))
    Combiner.combine_given_weights()

    # linear interpolation of two models, with fixed weights (but different for each feature).
    # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test2
    sys.stderr.write('Regression test 2\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'primary']], [[0.1, 0.9], [0.1, 1], [0.2, 0.8], [0.5, 0.5]], os.path.join('test', 'phrase-table_test2'))
    Combiner.combine_given_weights()

    # count-based combination of two models, with fixed weights
    # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test3 -m counts
    sys.stderr.write('Regression test 3\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'primary']], [[0.1, 0.9], [0.1, 1], [0.2, 0.8], [0.5, 0.5]], os.path.join('test', 'phrase-table_test3'), mode='counts')
    Combiner.combine_given_weights()

    # output phrase table should be identical to model1
    # command line: python tmcombine.py combine_given_weights test/model1 -w 1 -o test/phrase-table_test4 -m counts
    sys.stderr.write('Regression test 4\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary']], [1], os.path.join('test', 'phrase-table_test4'), mode='counts')
    Combiner.combine_given_weights()

    # count-based combination of two models with weights set through perplexity minimization
    # command line: python tmcombine.py combine_given_tuning_set test/model1 test/model2 -o test/phrase-table_test5 -m counts -r test/extract
    sys.stderr.write('Regression test 5\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'primary']], output_file=os.path.join('test', 'phrase-table_test5'), mode='counts', reference_file='test/extract')
    Combiner.combine_given_tuning_set()

    # loglinear combination of two models with fixed weights
    # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w 0.1,0.9 -o test/phrase-table_test6 -m loglinear
    sys.stderr.write('Regression test 6\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'primary']], weights=[0.1, 0.9], output_file=os.path.join('test', 'phrase-table_test6'), mode='loglinear')
    Combiner.combine_given_weights()

    # cross-entropy analysis of two models through a reference set
    # command line: python tmcombine.py compare_cross_entropies test/model1 test/model2 -m counts -r test/extract
    sys.stderr.write('Regression test 7\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'primary']], mode='counts', reference_file='test/extract')
    f = open(os.path.join('test', 'phrase-table_test7'), 'w')
    f.write(str(Combiner.compare_cross_entropies()))
    f.close()

    # maximum a posteriori combination of two models (Bacchiani et al. 2004; Foster et al. 2010) with weights set through cross-entropy minimization
    # command line: (currently not possible through the command line)
    sys.stderr.write('Regression test 8\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model1'), 'primary'], [os.path.join('test', 'model2'), 'map']], output_file=os.path.join('test', 'phrase-table_test8'), mode='counts', reference_file='test/extract')
    Combiner.combine_given_tuning_set()

    # count-based combination of two non-default models, with fixed weights. Same as test 3, but with the standard features moved back
    # command line: python tmcombine.py combine_given_weights test/model3 test/model4 -w "0.5,0.5;0.5,0.5;0.5,0.5;0.5,0.5;0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test9 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract
    sys.stderr.write('Regression test 9\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model3'), 'primary'], [os.path.join('test', 'model4'), 'primary']], [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.1, 0.9], [0.1, 1], [0.2, 0.8], [0.5, 0.5]], os.path.join('test', 'phrase-table_test9'), mode='counts', number_of_features=8, i_e2f=4, i_e2f_lex=5, i_f2e=6, i_f2e_lex=7)
    Combiner.combine_given_weights()

    # count-based combination of two non-default models, with weights set through perplexity minimization. Same as test 5, but with the standard features moved back
    # command line: python tmcombine.py combine_given_tuning_set test/model3 test/model4 -o test/phrase-table_test10 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract
    sys.stderr.write('Regression test 10\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model3'), 'primary'], [os.path.join('test', 'model4'), 'primary']], output_file=os.path.join('test', 'phrase-table_test10'), mode='counts', number_of_features=8, i_e2f=4, i_e2f_lex=5, i_f2e=6, i_f2e_lex=7, reference_file='test/extract')
    Combiner.combine_given_tuning_set()

    # count-based combination of two hierarchical models, with fixed weights. Same as test 3, but with hierarchical models
    # command line: python tmcombine.py combine_given_weights test/model5 test/model6 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test11 -m counts
    sys.stderr.write('Regression test 11\n')
    Combiner = Combine_TMs([[os.path.join('test', 'model5'), 'primary'], [os.path.join('test', 'model6'), 'primary']], [[0.1, 0.9], [0.1, 1], [0.2, 0.8], [0.5, 0.5]], os.path.join('test', 'phrase-table_test11'), mode='counts')
    Combiner.combine_given_weights()


# convert weight vector passed as a command line argument
class to_list(argparse.Action):
    def __call__(self, parser, namespace, weights, option_string=None):
        if ';' in weights:
            values = [[float(x) for x in vector.split(',')] for vector in weights.split(';')]
        else:
            values = [float(x) for x in weights.split(',')]
        setattr(namespace, self.dest, values)
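
# Illustrative sketch: to_list converts the -w argument into the two weight formats
# accepted by Combine_TMs (values below are hypothetical):
#   -w "0.1,0.9"                         -> [0.1, 0.9]                             (one weight per model)
#   -w "0.1,0.9;0.5,0.5;0.4,0.6;0.2,0.8" -> [[0.1, 0.9], [0.5, 0.5],
#                                            [0.4, 0.6], [0.2, 0.8]]               (one vector per feature)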


def parse_command_line():
    parser = argparse.ArgumentParser(description='Combine translation models. Check the docstring of the class Combine_TMs() and its methods for more in-depth documentation and additional configuration options not available through the command line. The function test() shows examples.')

    group1 = parser.add_argument_group('Main options')
    group2 = parser.add_argument_group('More model combination options')

    group1.add_argument('action', metavar='ACTION', choices=["combine_given_weights", "combine_given_tuning_set", "combine_reordering_tables", "compute_cross_entropy", "return_best_cross_entropy", "compare_cross_entropies"],
                        help='What you want to do with the models. One of %(choices)s.')

    group1.add_argument('model', metavar='DIRECTORY', nargs='+',
                        help='Model directory. Assumes default Moses structure (i.e. path to phrase table and lexical tables).')

    group1.add_argument('-w', '--weights', dest='weights', action=to_list,
                        default=None,
                        help='weight vector. Format 1: single vector, one weight per model. Example: \"0.1,0.9\"; format 2: one vector per feature, one weight per model: \"0.1,0.9;0.5,0.5;0.4,0.6;0.2,0.8\"')

    group1.add_argument('-m', '--mode', type=str,
                        default="interpolate",
                        choices=["counts", "interpolate", "loglinear"],
                        help='basic mixture-model algorithm. Default: %(default)s. Note: depending on mode and additional configuration, additional statistics are needed. Check the docstring documentation of Combine_TMs() for more info.')

    group1.add_argument('-r', '--reference', type=str,
                        default=None,
                        help='File containing reference phrase pairs for cross-entropy calculation. Default interface expects \'path/model/extract.gz\' that is produced by training a model on the reference (i.e. development) corpus.')

    group1.add_argument('-o', '--output', type=str,
                        default="-",
                        help='Output file (phrase table). If not specified, model is written to standard output.')

    group1.add_argument('--output-lexical', type=str,
                        default=None,
                        help=('Not only create a combined phrase table, but also combined lexical tables. Writes to OUTPUT_LEXICAL.e2f and OUTPUT_LEXICAL.f2e, or OUTPUT_LEXICAL.counts.e2f in mode \'counts\'.'))

    group1.add_argument('--lowmem', action="store_true",
                        help=('Low memory mode: requires two passes (and sorting in between) to combine a phrase table, but loads less data into memory. Only relevant for mode "counts" and some configurations of mode "interpolate".'))

    group1.add_argument('--tempdir', type=str,
                        default=None,
                        help=('Temporary directory in --lowmem mode.'))

    group2.add_argument('--i_e2f', type=int,
                        default=0, metavar='N',
                        help=('Index of p(f|e) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)'))

    group2.add_argument('--i_e2f_lex', type=int,
                        default=1, metavar='N',
                        help=('Index of lex(f|e) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). (default: %(default)s)'))

    group2.add_argument('--i_f2e', type=int,
                        default=2, metavar='N',
                        help=('Index of p(e|f) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)'))

    group2.add_argument('--i_f2e_lex', type=int,
                        default=3, metavar='N',
                        help=('Index of lex(e|f) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). (default: %(default)s)'))

    group2.add_argument('--number_of_features', type=int,
                        default=4, metavar='N',
                        help=('Combine models with N + 1 features (last feature is constant phrase penalty). (default: %(default)s)'))

    group2.add_argument('--normalized', action="store_true",
                        help=('for each phrase pair x,y: ignore models with p(y)=0, and distribute probability mass among models with p(y)>0. (default: missing entries (x,y) are always interpreted as p(x|y)=0). Only relevant in mode "interpolate".'))

    group2.add_argument('--write-phrase-penalty', action="store_true",
                        help=("Include phrase penalty in phrase table"))

    group2.add_argument('--recompute_lexweights', action="store_true",
                        help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))

    return parser.parse_args()


if __name__ == "__main__":

    if len(sys.argv) < 2:
        sys.stderr.write("no command specified. use option -h for usage instructions\n")

    elif sys.argv[1] == "test":
        test()

    else:
        args = parse_command_line()

        # initialize
        combiner = Combine_TMs([(m, 'primary') for m in args.model],
                               weights=args.weights,
                               mode=args.mode,
                               output_file=args.output,
                               reference_file=args.reference,
                               output_lexical=args.output_lexical,
                               lowmem=args.lowmem,
                               normalized=args.normalized,
                               recompute_lexweights=args.recompute_lexweights,
                               tempdir=args.tempdir,
                               number_of_features=args.number_of_features,
                               i_e2f=args.i_e2f,
                               i_e2f_lex=args.i_e2f_lex,
                               i_f2e=args.i_f2e,
                               i_f2e_lex=args.i_f2e_lex,
                               write_phrase_penalty=args.write_phrase_penalty)

        # execute the requested method
        f_string = "combiner." + args.action + '()'
        exec(f_string)