#!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich # This program handles the combination of Moses phrase tables, either through # linear interpolation of the phrase translation probabilities/lexical weights, # or through a recomputation based on the (weighted) combined counts. # # It also supports an automatic search for weights that minimize the cross-entropy # between the model and a tuning set of word/phrase alignments. # for usage information, run # python tmcombine.py -h # you can also check the docstrings of Combine_TMs() for more information and find some example commands in the function test() # Some general things to note: # - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. # - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C. # - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). # - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. # - The cross-entropy estimation assumes that phrase tables contain true probability distributions (i.e. a probability mass of 1 for each conditional probability distribution). If this is not true, the results are skewed. # - Unknown phrase pairs are not considered for the cross-entropy estimation. A comparison of models with different vocabularies may be misleading. # - Don't directly compare cross-entropies obtained from a combination with different modes. Depending on how some corner cases are treated, linear interpolation does not distribute full probability mass and thus shows higher (i.e. worse) cross-entropies. from __future__ import division, unicode_literals import sys import os import gzip import argparse import copy import re from math import log, exp from collections import defaultdict from operator import mul from tempfile import NamedTemporaryFile from subprocess import Popen try: from itertools import izip except: izip = zip try: from lxml import etree as ET except: import xml.etree.cElementTree as ET try: from scipy.optimize.lbfgsb import fmin_l_bfgs_b optimizer = 'l-bfgs' except: optimizer = 'hillclimb' class Moses(): """Moses interface for loading/writing models to support other phrase table formats, subclass this and overwrite the relevant functions """ def __init__(self,models,number_of_features): self.number_of_features = number_of_features self.models = models #example item (assuming mode=='counts' and one feature): phrase_pairs['the house']['das haus'] = [[[10,100]],['0-0 1-1']] self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]])) self.phrase_source = defaultdict(lambda: [0]*len(self.models)) self.phrase_target = defaultdict(lambda: [0]*len(self.models)) self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)])) self.word_pairs_e2f = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models))) self.word_pairs_f2e = defaultdict(lambda: defaultdict(lambda: [0]*len(self.models))) self.word_source = defaultdict(lambda: [0]*len(self.models)) self.word_target = defaultdict(lambda: [0]*len(self.models)) self.require_alignment = False def open_table(self,model,table,mode='r'): """define which paths to open for lexical tables and phrase tables. we assume canonical Moses structure, but feel free to overwrite this """ if table == 'reordering-table': table = 'reordering-table.wbe-msd-bidirectional-fe' filename = os.path.join(model,'model',table) fileobj = handle_file(filename,'open',mode) return fileobj def load_phrase_features(self,line,priority,i,mode='interpolate',store='pairs',filter_by=None,filter_by_src=None,filter_by_target=None,inverted=False,flags=None): """take single phrase table line and store probablities in internal data structure""" src = line[0] target = line[1] if inverted: src,target = target,src if (store == 'all' or store == 'pairs') and (priority < 10 or (src in self.phrase_pairs and target in self.phrase_pairs[src])) and not (filter_by and not (src in filter_by and target in filter_by[src])): self.store_info(src,target,line) scores = line[2].split() if len(scores) = 5: if not self.phrase_pairs[src][target][1]: self.phrase_pairs[src][target][1] = line[3:] # assuming that alignment is empty elif len(line) == 4: if self.require_alignment: sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n') exit(1) self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')] else: sys.stderr.write('Error: unexpected phrase table format. Are you using a very old/new version of Moses with different formatting?\n') exit(1) def get_word_alignments(self,src,target,cache=False,mycache={}): """from the Moses phrase table alignment info in the form "0-0 1-0", get the aligned word pairs / NULL alignments """ if cache: if (src,target) in mycache: return mycache[(src,target)] try: alignment = self.phrase_pairs[src][target][1][0] except: return None,None src_list = src.split(b' ') target_list = target.split(b' ') textual_e2f = [[s,[]] for s in src_list] textual_f2e = [[t,[]] for t in target_list] for pair in alignment.split(b' '): s,t = pair.split(b'-') s,t = int(s),int(t) textual_e2f[s][1].append(target_list[t]) textual_f2e[t][1].append(src_list[s]) for s,t in textual_e2f: if not t: t.append(b'NULL') for s,t in textual_f2e: if not t: t.append(b'NULL') #tupelize so we can use the value as dictionary keys for i in range(len(textual_e2f)): textual_e2f[i][1] = tuple(textual_e2f[i][1]) for i in range(len(textual_f2e)): textual_f2e[i][1] = tuple(textual_f2e[i][1]) if cache: mycache[(src,target)] = textual_e2f,textual_f2e return textual_e2f,textual_f2e def write_phrase_table(self,src,target,weights,features,mode,flags): """convert data to string in Moses phrase table format""" # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to phrasetable # (phrase pair will end up with probability zero in log-linear model anyway) if 0 in features: return b'' # information specific to Moses model: alignment info and comment section with target and source counts additional_entries = self.phrase_pairs[src][target][1] alignment = additional_entries[0] if alignment: extra_space = b' ' else: extra_space = b'' if mode == 'counts': i_e2f = flags['i_e2f'] i_f2e = flags['i_f2e'] srccount = dot_product(self.phrase_source[src],weights[i_f2e]) targetcount = dot_product(self.phrase_target[target],weights[i_e2f]) additional_entries[1] = b"%s %s" %(targetcount,srccount) features = b' '.join([b'%.6g' %(f) for f in features]) if flags['add_origin_features']: origin_features = list(map(lambda x: 2.718**bool(x),self.phrase_pairs[src][target][0][0])) # 1 if phrase pair doesn't occur in model, 2.718 if it does origin_features = b' '.join([b'%.4f' %(f) for f in origin_features]) + ' ' else: origin_features = b'' if flags['write_phrase_penalty']: phrase_penalty = b' 2.718' else: phrase_penalty = b'' line = b"%s ||| %s ||| %s%s %s||| %s%s||| %s\n" %(src,target,features,origin_features,phrase_penalty,alignment,extra_space,b' ||| '.join(additional_entries[1:])) return line def write_lexical_file(self,direction, path, weights,mode): if mode == 'counts': bridge = '.counts' else: bridge = '' fobj = handle_file("{0}{1}.{2}".format(path,bridge,direction),'open',mode='w') sys.stderr.write('Writing {0}{1}.{2}\n'.format(path,bridge,direction)) if direction == 'e2f': word_pairs = self.word_pairs_e2f marginal = self.word_target elif direction == 'f2e': word_pairs = self.word_pairs_f2e marginal = self.word_source for x in sorted(word_pairs): for y in sorted(word_pairs[x]): xy = dot_product(word_pairs[x][y],weights) fobj.write(b"%s %s %s" %(x,y,xy)) if mode == 'counts': fobj.write(b" %s\n" %(dot_product(marginal[y],weights))) else: fobj.write(b'\n') handle_file("{0}{1}.{2}".format(path,bridge,direction),'close',fobj,mode='w') def write_reordering_table(self,src,target,features): """convert data to string in Moses reordering table format""" # if one feature value is 0 (either because of loglinear interpolation or rounding to 0), don't write it to reordering table # (phrase pair will end up with probability zero in log-linear model anyway) if 0 in features: return b'' features = b' '.join([b'%.6g' %(f) for f in features]) line = b"%s ||| %s ||| %s\n" %(src,target,features) return line def create_inverse(self,fobj,tempdir=None): """swap source and target phrase in the phrase table, and then sort (by target phrase)""" inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir) swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|') # just swap source and target phrase, and leave order of scores etc. intact. # For better compatibility with existing codebase, we swap the order of the phrases back for processing for line in fobj: inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1)) inverse.close() inverse_sorted = sort_file(inverse.name,tempdir=tempdir) os.remove(inverse.name) return inverse_sorted def merge(self,pt_normal, pt_inverse, pt_out, mode='interpolate'): """merge two phrasetables (the latter having been inverted to calculate p(s|t) and lex(s|t) in sorted order) Assumes that p(s|t) and lex(s|t) are in first table half, p(t|s) and lex(t|s) in second""" for line,line2 in izip(pt_normal,pt_inverse): line = line.split(b' ||| ') if line[-1].endswith(b' |||'): line[-1] = line[-1][:-4] line.append('') line2 = line2.split(b' ||| ') if line2[-1].endswith(b' |||'): line2[-1] = line2[-1][:-4] line2.append('') #scores mid = int(self.number_of_features/2) scores1 = line[2].split() scores2 = line2[2].split() line[2] = b' '.join(scores2[:mid]+scores1[mid:]) # marginal counts if mode == 'counts': src_count = line[4].split()[1] target_count = line2[-1].split()[0] line[4] = b' '.join([target_count,src_count]) pt_out.write(b' ||| '.join(line)+ b'\n') pt_normal.close() pt_inverse.close() pt_out.close() class TigerXML(): """interface to load reference word alignments from TigerXML corpus. Tested on SMULTRON (http://kitt.cl.uzh.ch/kitt/smultron/) """ def __init__(self,alignment_xml): """only argument is TigerXML file """ self.treebanks = self._open_treebanks(alignment_xml) self.word_pairs = defaultdict(lambda: defaultdict(int)) self.word_source = defaultdict(int) self.word_target = defaultdict(int) def load_word_pairs(self,src,target): """load word pairs. src and target are the itentifiers of the source and target language in the XML""" if not src or not target: sys.stderr.write('Error: Source and/or target language not specified. Required for TigerXML extraction.\n') exit(1) alignments = self._get_aligned_ids(src,target) self._textualize_alignments(src,target,alignments) def _open_treebanks(self,alignment_xml): """Parallel XML format references monolingual files. Open all.""" alignment_path = os.path.dirname(alignment_xml) align_xml = ET.parse(alignment_xml) treebanks = {} treebanks['aligned'] = align_xml for treebank in align_xml.findall('//treebank'): treebank_id = treebank.get('id') filename = treebank.get('filename') if not os.path.isabs(filename): filename = os.path.join(alignment_path,filename) treebanks[treebank_id] = ET.parse(filename) return treebanks def _get_aligned_ids(self,src,target): """first step: find which nodes are aligned.""" alignments = [] ids = defaultdict(dict) for alignment in self.treebanks['aligned'].findall('//align'): newpair = {} if len(alignment) != 2: sys.stderr.write('Error: alignment with ' + str(len(alignment)) + ' children. Expected 2. Skipping.\n') continue for node in alignment: lang = node.get('treebank_id') node_id = node.get('node_id') newpair[lang] = node_id if not (src in newpair and target in newpair): sys.stderr.write('Error: source and target languages don\'t match. Skipping.\n') continue # every token may only appear in one alignment pair; # if it occurs in multiple, we interpret them as one 1-to-many or many-to-1 alignment if newpair[src] in ids[src]: idx = ids[src][newpair[src]] alignments[idx][1].append(newpair[target]) elif newpair[target] in ids[target]: idx = ids[target][newpair[target]] alignments[idx][0].append(newpair[src]) else: idx = len(alignments) alignments.append(([newpair[src]],[newpair[target]])) ids[src][newpair[src]] = idx ids[target][newpair[target]] = idx alignments = self._discard_discontinuous(alignments) return alignments def _discard_discontinuous(self,alignments): """discard discontinuous word sequences (which we can't use for phrase-based SMT systems) and make sure that sequence is in correct order. """ new_alignments = [] for alignment in alignments: new_pair = [] for sequence in alignment: sequence_split = [t_id.split('_') for t_id in sequence] #check if all words come from the same sentence sentences = [item[0] for item in sequence_split] if not len(set(sentences)) == 1: #sys.stderr.write('Warning. Word sequence crossing sentence boundary. Discarding.\n') #sys.stderr.write(str(sequence)+'\n') continue #sort words and check for discontinuities. try: tokens = sorted([int(item[1]) for item in sequence_split]) except ValueError: #sys.stderr.write('Warning. Not valid word IDs. Discarding.\n') #sys.stderr.write(str(sequence)+'\n') continue if not tokens[-1]-tokens[0] == len(tokens)-1: #sys.stderr.write('Warning. Discontinuous word sequence(?). Discarding.\n') #sys.stderr.write(str(sequence)+'\n') continue out_sequence = [sentences[0]+'_'+str(token) for token in tokens] new_pair.append(out_sequence) if len(new_pair) == 2: new_alignments.append(new_pair) return new_alignments def _textualize_alignments(self,src,target,alignments): """Knowing which nodes are aligned, get actual words that are aligned.""" words = defaultdict(dict) for text in [text for text in self.treebanks if not text == 'aligned']: #TODO: Make lowercasing optional for terminal in self.treebanks[text].findall('//t'): words[text][terminal.get('id')] = terminal.get('word').lower() for (src_ids, target_ids) in alignments: try: src_text = ' '.join((words[src][src_id] for src_id in src_ids)) except KeyError: #sys.stderr.write('Warning. ID not found: '+ str(src_ids) +'\n') continue try: target_text = ' '.join((words[target][target_id] for target_id in target_ids)) except KeyError: #sys.stderr.write('Warning. ID not found: '+ str(target_ids) +'\n') continue self.word_pairs[src_text][target_text] += 1 self.word_source[src_text] += 1 self.word_target[target_text] += 1 class Moses_Alignment(): """interface to load reference phrase alignment from corpus aligend with Giza++ and with extraction heuristics as applied by the Moses toolkit. """ def __init__(self,alignment_file): self.alignment_file = alignment_file self.word_pairs = defaultdict(lambda: defaultdict(int)) self.word_source = defaultdict(int) self.word_target = defaultdict(int) def load_word_pairs(self,src_lang,target_lang): """main function. overwrite this to import data in different format.""" fileobj = handle_file(self.alignment_file,'open','r') for line in fileobj: line = line.split(b' ||| ') if line[-1].endswith(b' |||'): line[-1] = line[-1][:-4] line.append(b'') src = line[0] target = line[1] self.word_pairs[src][target] += 1 self.word_source[src] += 1 self.word_target[target] += 1 def dot_product(a,b): """calculate dot product from two lists""" # optimized for PyPy (much faster than enumerate/map) s = 0 i = 0 for x in a: s += x * b[i] i += 1 return s def priority_sort_models(models): """primary models should have priority before supplementary models. zipped with index to know which weight model belongs to """ return [(model,priority,i) for (i,(model,priority)) in sorted(zip(range(len(models)),models),key=lambda x: x[1][1])] def cross_entropy(model_interface,reference_interface,weights,score,mode,flags): """calculate cross entropy given all necessary information. don't call this directly, but use one of the Combine_TMs methods. """ weights = normalize_weights(weights,mode,flags) if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']: num_results = len(model_interface.models) else: num_results = 1 cross_entropies = [[0]*num_results for i in range(model_interface.number_of_features)] oov = [0]*num_results oov2 = 0 other_translations = [0]*num_results ignored = [0]*num_results n = [0]*num_results total_pairs = 0 for src in reference_interface.word_pairs: for target in reference_interface.word_pairs[src]: c = reference_interface.word_pairs[src][target] for i in range(num_results): if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]: if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']) or ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']): if 0 in model_interface.phrase_pairs[src][target][0][0]: #only use intersection of models for comparability # update unknown words statistics if model_interface.phrase_pairs[src][target][0][0][i]: ignored[i] += c elif src in model_interface.phrase_source and model_interface.phrase_source[src][i]: other_translations[i] += c else: oov[i] += c continue if ('compare_cross-entropies' in flags and flags['compare_cross-entropies']): tmp_weights = [[0]*i+[1]+[0]*(num_results-i-1)]*model_interface.number_of_features elif ('intersected_cross-entropies' in flags and flags['intersected_cross-entropies']): tmp_weights = weights features = score(tmp_weights,src,target,model_interface,flags) else: features = score(weights,src,target,model_interface,flags) #if weight is so low that feature gets probability zero if 0 in features: #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target)) #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n') #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n') continue n[i] += c for j in range(model_interface.number_of_features): cross_entropies[j][i] -= log(features[j],2)*c elif src in model_interface.phrase_source and not ('compare_cross-entropies' in flags and flags['compare_cross-entropies']): other_translations[i] += c else: oov2 += c total_pairs += c oov2 = int(oov2/num_results) for i in range(num_results): try: for j in range(model_interface.number_of_features): cross_entropies[j][i] /= n[i] except ZeroDivisionError: sys.stderr.write('Warning: no matching phrase pairs between reference set and model\n') for j in range(model_interface.number_of_features): cross_entropies[j][i] = 0 if 'compare_cross-entropies' in flags and flags['compare_cross-entropies']: return [tuple([ce[i] for ce in cross_entropies]) + (other_translations[i],oov[i],ignored[i],n[i],total_pairs) for i in range(num_results)], (n[0],total_pairs,oov2) else: return tuple([ce[0] for ce in cross_entropies]) + (other_translations[0],oov2,total_pairs) def cross_entropy_light(model_interface,reference_interface,weights,score,mode,flags,cache): """calculate cross entropy given all necessary information. don't call this directly, but use one of the Combine_TMs methods. Same as cross_entropy, but optimized for speed: it doesn't generate all of the statistics, doesn't normalize, and uses caching. """ weights = normalize_weights(weights,mode,flags) cross_entropies = [0]*model_interface.number_of_features for (src,target,c) in cache: features = score(weights,src,target,model_interface,flags,cache=True) if 0 in features: #sys.stderr.write('Warning: 0 probability in model {0}: source phrase: {1!r}; target phrase: {2!r}\n'.format(i,src,target)) #sys.stderr.write('Possible reasons: 0 probability in phrase table; very low (or 0) weight; recompute lexweight and different alignments\n') #sys.stderr.write('Phrase pair is ignored for cross_entropy calculation\n\n') continue for i in range(model_interface.number_of_features): cross_entropies[i] -= log(features[i],2)*c return cross_entropies def _get_reference_cache(reference_interface,model_interface): """creates a data structure that allows for a quick access to all relevant reference set phrase/word pairs and their frequencies. """ cache = [] n = 0 for src in reference_interface.word_pairs: for target in reference_interface.word_pairs[src]: if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]: c = reference_interface.word_pairs[src][target] cache.append((src,target,c)) n += c return cache,n def _get_lexical_filter(reference_interface,model_interface): """returns dictionaries that store the words and word pairs needed for perplexity optimization. We can use these dicts to load fewer data into memory for optimization.""" e2f_filter = defaultdict(set) f2e_filter = defaultdict(set) for src in reference_interface.word_pairs: for target in reference_interface.word_pairs[src]: if src in model_interface.phrase_pairs and target in model_interface.phrase_pairs[src]: e2f_alignment,f2e_alignment = model_interface.get_word_alignments(src,target) for s,t_list in e2f_alignment: for t in t_list: e2f_filter[s].add(t) for t,s_list in f2e_alignment: for s in s_list: f2e_filter[t].add(s) return e2f_filter,f2e_filter def _hillclimb_move(weights,stepsize,mode,flags): """Move function for hillclimb algorithm. Updates each weight by stepsize.""" for i,w in enumerate(weights): yield normalize_weights(weights[:i]+[w+stepsize]+weights[i+1:],mode,flags) for i,w in enumerate(weights): new = w-stepsize if new >= 1e-10: yield normalize_weights(weights[:i]+[new]+weights[i+1:],mode,flags) def _hillclimb(scores,best_weights,objective,model_interface,reference_interface,score_function,mode,flags,precision,cache,n): """first (deprecated) implementation of iterative weight optimization.""" best = objective(best_weights) i = 0 #counts number of iterations with same stepsize: if greater than 10, it is doubled stepsize = 512 # initial stepsize move = 1 #whether we found a better set of weights in the current iteration. if not, it is halfed sys.stderr.write('Hillclimb: step size: ' + str(stepsize)) while stepsize > 0.0078: if not move: stepsize /= 2 sys.stderr.write(' ' + str(stepsize)) i = 0 move = 1 continue move = 0 for w in _hillclimb_move(list(best_weights),stepsize,mode,flags): weights_tuple = tuple(w) if weights_tuple in scores: continue scores[weights_tuple] = cross_entropy_light(model_interface,reference_interface,[w for m in range(model_interface.number_of_features)],score_function,mode,flags,cache) if objective(weights_tuple)+precision < best: best = objective(weights_tuple) best_weights = weights_tuple move = 1 if i and not i % 10: sys.stderr.write('\nIteration '+ str(i) + ' with stepsize ' + str(stepsize) + '. current cross-entropy: ' + str(best) + '- weights: ' + str(best_weights) + ' ') stepsize *= 2 sys.stderr.write('\nIncreasing stepsize: '+ str(stepsize)) i = 0 i += 1 return best_weights def optimize_cross_entropy_hillclimb(model_interface,reference_interface,initial_weights,score_function,mode,flags,precision=0.000001): """find weights that minimize cross-entropy on a tuning set deprecated (default is now L-BFGS (optimize_cross_entropy)), but left in for people without SciPy """ scores = {} best_weights = tuple(initial_weights[0]) cache,n = _get_reference_cache(reference_interface,model_interface) # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed objectives = [(lambda x: scores[x][i]/n,[i],'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)] scores[best_weights] = cross_entropy_light(model_interface,reference_interface,initial_weights,score_function,mode,flags,cache) final_weights = initial_weights[:] final_cross_entropy = [0]*model_interface.number_of_features for i,(objective, features, comment) in enumerate(objectives): best_weights = min(scores,key=objective) sys.stderr.write('Optimizing objective "' + comment +'"\n') best_weights = _hillclimb(scores,best_weights,objective,model_interface,reference_interface,score_function,feature_specific_mode(mode,i,flags),flags,precision,cache,n) sys.stderr.write('\nCross-entropy:' + str(objective(best_weights)) + ' - weights: ' + str(best_weights)+'\n\n') for j in features: final_weights[j] = list(best_weights) final_cross_entropy[j] = objective(best_weights) return final_weights,final_cross_entropy def optimize_cross_entropy(model_interface,reference_interface,initial_weights,score_function,mode,flags): """find weights that minimize cross-entropy on a tuning set Uses L-BFGS optimization and requires SciPy """ if not optimizer == 'l-bfgs': sys.stderr.write('SciPy is not installed. Falling back to naive hillclimb optimization (instead of L-BFGS)\n') return optimize_cross_entropy_hillclimb(model_interface,reference_interface,initial_weights,score_function,mode,flags) cache,n = _get_reference_cache(reference_interface,model_interface) # each objective is a triple: which score to minimize from cross_entropy(), which weights to update accordingly, and a comment that is printed objectives = [(lambda w: cross_entropy_light(model_interface,reference_interface,[[1]+list(w) for m in range(model_interface.number_of_features)],score_function,feature_specific_mode(mode,i,flags),flags,cache)[i],[i],'minimize cross-entropy for feature {0}'.format(i)) for i in range(model_interface.number_of_features)] #optimize cross-entropy for p(s|t) final_weights = initial_weights[:] final_cross_entropy = [0]*model_interface.number_of_features for i,(objective, features, comment) in enumerate(objectives): sys.stderr.write('Optimizing objective "' + comment +'"\n') initial_values = [1]*(len(model_interface.models)-1) # we leave value of first model at 1 and optimize all others (normalized of course) best_weights, best_point, data = fmin_l_bfgs_b(objective,initial_values,approx_grad=True,bounds=[(0.000000001,None)]*len(initial_values)) best_weights = normalize_weights([1]+list(best_weights),feature_specific_mode(mode,i,flags),flags) sys.stderr.write('Cross-entropy after L-BFGS optimization: ' + str(best_point/n) + ' - weights: ' + str(best_weights)+'\n') for j in features: final_weights[j] = list(best_weights) final_cross_entropy[j] = best_point/n return final_weights,final_cross_entropy def feature_specific_mode(mode,i,flags): """in mode 'counts', only the default Moses features can be recomputed from raw frequencies; all other features are interpolated by default. This fucntion mostly serves optical purposes (i.e. normalizing a single weight vector for logging), since normalize_weights also handles a mix of interpolated and recomputed features. """ if mode == 'counts' and i not in [flags['i_e2f'],flags['i_e2f_lex'],flags['i_f2e'],flags['i_f2e_lex']]: return 'interpolate' else: return mode def redistribute_probability_mass(weights,src,target,interface,flags,mode='interpolate'): """the conditional probability p(x|y) is undefined for cases where p(y) = 0 this function redistributes the probability mass to only consider models for which p(y) > 0 """ i_e2f = flags['i_e2f'] i_e2f_lex = flags['i_e2f_lex'] i_f2e = flags['i_f2e'] i_f2e_lex = flags['i_f2e_lex'] new_weights = weights[:] if flags['normalize_s_given_t'] == 's': # set weight to 0 for all models where target phrase is unseen (p(s|t) new_weights[i_e2f] = list(map(mul,interface.phrase_source[src],weights[i_e2f])) if flags['normalize-lexical_weights']: new_weights[i_e2f_lex] = list(map(mul,interface.phrase_source[src],weights[i_e2f_lex])) elif flags['normalize_s_given_t'] == 't': # set weight to 0 for all models where target phrase is unseen (p(s|t) new_weights[i_e2f] = list(map(mul,interface.phrase_target[target],weights[i_e2f])) if flags['normalize-lexical_weights']: new_weights[i_e2f_lex] = list(map(mul,interface.phrase_target[target],weights[i_e2f_lex])) # set weight to 0 for all models where source phrase is unseen (p(t|s) new_weights[i_f2e] = list(map(mul,interface.phrase_source[src],weights[i_f2e])) if flags['normalize-lexical_weights']: new_weights[i_f2e_lex] = list(map(mul,interface.phrase_source[src],weights[i_f2e_lex])) return normalize_weights(new_weights,mode,flags) def score_interpolate(weights,src,target,interface,flags,cache=False): """linear interpolation of probabilites (and other feature values) if normalized is True, the probability mass for p(x|y) is redistributed to models with p(y) > 0 """ model_values = interface.phrase_pairs[src][target][0] scores = [0]*len(model_values) if 'normalized' in flags and flags['normalized']: normalized_weights = redistribute_probability_mass(weights,src,target,interface,flags) else: normalized_weights = weights if 'recompute_lexweights' in flags and flags['recompute_lexweights']: e2f_alignment,f2e_alignment = interface.get_word_alignments(src,target,cache=cache) if not e2f_alignment or not f2e_alignment: sys.stderr.write('Error: no word alignments found, but necessary for lexical weight computation.\n') lst = 0 lts = 0 else: scores[flags['i_e2f_lex']] = compute_lexicalweight(normalized_weights[flags['i_e2f_lex']],e2f_alignment,interface.word_pairs_e2f,None,mode='interpolate') scores[flags['i_f2e_lex']] = compute_lexicalweight(normalized_weights[flags['i_f2e_lex']],f2e_alignment,interface.word_pairs_f2e,None,mode='interpolate') for idx,prob in enumerate(model_values): if not ('recompute_lexweights' in flags and flags['recompute_lexweights'] and (idx == flags['i_e2f_lex'] or idx == flags['i_f2e_lex'])): scores[idx] = dot_product(prob,normalized_weights[idx]) return scores def score_loglinear(weights,src,target,interface,flags,cache=False): """loglinear interpolation of probabilites warning: if phrase pair does not occur in all models, resulting probability is 0 this is usually not what you want - loglinear scoring is only included for completeness' sake """ scores = [] model_values = interface.phrase_pairs[src][target][0] for idx,prob in enumerate(model_values): try: scores.append(exp(dot_product(list(map(log,prob)),weights[idx]))) except ValueError: scores.append(0) return scores def score_counts(weights,src,target,interface,flags,cache=False): """count-based re-estimation of probabilites and lexical weights each count is multiplied by its weight; trivial case is weight 1 for each model, which corresponds to a concatentation """ i_e2f = flags['i_e2f'] i_e2f_lex = flags['i_e2f_lex'] i_f2e = flags['i_f2e'] i_f2e_lex = flags['i_f2e_lex'] # if we have non-default number of weights, assume that we might have to do a mix of count-based and interpolated scores. if len(weights) == 4: scores = [0]*len(weights) else: scores = score_interpolate(weights,src,target,interface,flags,cache=cache) try: joined_count = dot_product(interface.phrase_pairs[src][target][0][i_e2f],weights[i_e2f]) target_count = dot_product(interface.phrase_target[target],weights[i_e2f]) scores[i_e2f] = joined_count / target_count except ZeroDivisionError: scores[i_e2f] = 0 try: joined_count = dot_product(interface.phrase_pairs[src][target][0][i_f2e],weights[i_f2e]) source_count = dot_product(interface.phrase_source[src],weights[i_f2e]) scores[i_f2e] = joined_count / source_count except ZeroDivisionError: scores[i_f2e] = 0 e2f_alignment,f2e_alignment = interface.get_word_alignments(src,target,cache=cache) if not e2f_alignment or not f2e_alignment: sys.stderr.write('Error: no word alignments found, but necessary for lexical weight computation.\n') scores[i_e2f_lex] = 0 scores[i_f2e_lex] = 0 else: scores[i_e2f_lex] = compute_lexicalweight(weights[i_e2f_lex],e2f_alignment,interface.word_pairs_e2f,interface.word_target,mode='counts',cache=cache) scores[i_f2e_lex] = compute_lexicalweight(weights[i_f2e_lex],f2e_alignment,interface.word_pairs_f2e,interface.word_source,mode='counts',cache=cache) return scores def score_interpolate_reordering(weights,src,target,interface): """linear interpolation of reordering model probabilities also normalizes model so that """ model_values = interface.reordering_pairs[src][target] scores = [0]*len(model_values) for idx,prob in enumerate(model_values): scores[idx] = dot_product(prob,weights[idx]) #normalizes first half and last half probabilities (so that each half sums to one). #only makes sense for bidirectional configuration in Moses. Remove/change this if you want a different (or no) normalization scores = normalize_weights(scores[:int(interface.number_of_features/2)],'interpolate') + normalize_weights(scores[int(interface.number_of_features/2):],'interpolate') return scores def compute_lexicalweight(weights,alignment,word_pairs,marginal,mode='counts',cache=False,mycache=[0,defaultdict(dict)]): """compute the lexical weights as implemented in Moses toolkit""" lex = 1 # new weights: empty cache if cache and mycache[0] != weights: mycache[0] = weights mycache[1] = defaultdict(dict) for x,translations in alignment: # skip nonterminals if x.startswith(b'['): continue if cache and translations in mycache[1][x]: lex_step = mycache[1][x][translations] else: lex_step = 0 for y in translations: if mode == 'counts': lex_step += dot_product(word_pairs[x][y],weights) / dot_product(marginal[y],weights) elif mode == 'interpolate': lex_step += dot_product(word_pairs[x][y],weights) lex_step /= len(translations) if cache: mycache[1][x][translations] = lex_step lex *= lex_step return lex def normalize_weights(weights,mode,flags=None): """make sure that probability mass in linear interpolation is 1 for weighted counts, weight of first model is set to 1 """ if mode == 'interpolate' or mode == 'loglinear': if type(weights[0]) == list: new_weights = [] for weight_list in weights: total = sum(weight_list) try: weight_list = [weight/total for weight in weight_list] except ZeroDivisionError: sys.stderr.write('Error: Zero division in weight normalization. Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in model with weight 0\n') new_weights.append(weight_list) else: total = sum(weights) try: new_weights = [weight/total for weight in weights] except ZeroDivisionError: sys.stderr.write('Error: Zero division in weight normalization. Are some of your weights zero? This might lead to undefined behaviour if a phrase pair is only seen in model with weight 0\n') elif mode == 'counts_pure': if type(weights[0]) == list: new_weights = [] for weight_list in weights: ratio = 1/weight_list[0] new_weights.append([weight * ratio for weight in weight_list]) else: ratio = 1/weights[0] new_weights = [weight * ratio for weight in weights] # make sure that features other than the standard Moses features are always interpolated (since no count-based computation is defined) elif mode == 'counts': if type(weights[0]) == list: norm_counts = normalize_weights(weights,'counts_pure') new_weights = normalize_weights(weights,'interpolate') for i in [flags['i_e2f'],flags['i_e2f_lex'],flags['i_f2e'],flags['i_f2e_lex']]: new_weights[i] = norm_counts[i] return new_weights else: return normalize_weights(weights,'counts_pure') return new_weights def handle_file(filename,action,fileobj=None,mode='r'): """support reading/writing either from/to file, stdout or gzipped file""" if action == 'open': if mode == 'r': mode = 'rb' elif mode == 'w': mode = 'wb' if mode == 'rb' and not filename == '-' and not os.path.exists(filename): if os.path.exists(filename+'.gz'): filename = filename+'.gz' else: sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n') if 'counts' in filename and os.path.exists(os.path.dirname(filename)): sys.stderr.write('For a weighted counts combination, we need statistics that Moses doesn\'t write to disk by default.\n') sys.stderr.write('Repeat step 4 of Moses training for all models with the option -write-lexical-counts.\n') exit(1) if filename.endswith('.gz'): fileobj = gzip.open(filename,mode) elif filename == '-' and mode == 'wb': fileobj = sys.stdout else: fileobj = open(filename,mode) return fileobj elif action == 'close' and filename != '-': fileobj.close() def sort_file(filename,tempdir=None): """Sort a file and return temporary file""" cmd = ['sort', filename] env = {} env['LC_ALL'] = 'C' if tempdir: cmd.extend(['-T',tempdir]) outfile = NamedTemporaryFile(delete=False,dir=tempdir) sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n') p = Popen(cmd,env=env,stdout=outfile.file) p.wait() outfile.seek(0) return outfile class Combine_TMs(): """This class handles the various options, checks them for sanity and has methods that define what models to load and what functions to call for the different tasks. Typically, you only need to interact with this class and its attributes. """ #some flags that change the behaviour during scoring. See init docstring for more info flags = {'normalized':False, 'recompute_lexweights':False, 'intersected_cross-entropies':False, 'normalize_s_given_t':None, 'normalize-lexical_weights':True, 'add_origin_features':False, 'write_phrase_penalty':False, 'lowmem': False, 'i_e2f':0, 'i_e2f_lex':1, 'i_f2e':2, 'i_f2e_lex':3 } # each model needs a priority. See init docstring for more info _priorities = {'primary':1, 'map':2, 'supplementary':10} def __init__(self,models,weights=None, output_file=None, mode='interpolate', number_of_features=4, model_interface=Moses, reference_interface=Moses_Alignment, reference_file=None, lang_src=None, lang_target=None, output_lexical=None, **flags): """The whole configuration of the task is done during intialization. Afterwards, you only need to call your intended method(s). You can change some of the class attributes afterwards (such as the weights, or the output file), but you should never change the models or mode after initialization. See unit_test function for example configurations models: list of tuples (path,priority) that defines which models to process. Path is usually the top directory of a Moses model. There are three priorities: 'primary': phrase pairs with this priority will always be included in output model. For most purposes, you'll want to define all models as primary. 'map': for maximum a-posteriori combination (Bacchiani et al. 2004; Foster et al. 2010). for use with mode 'counts'. stores c(t) = 1 and c(s,t) = p(s|t) 'supplementary': phrase pairs are considered for probability computation, but not included in output model (unless they also occur in at least one primary model) useful for rescoring a model without changing its vocabulary. weights: accept two types of weight declarations: one weight per model, and one weight per model and feature type one is internally converted to type two. For 2 models with four features, this looks like: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]] default: uniform weights (None) output_file: filepath of output phrase table. If it ends with .gz, file is automatically zipped. output_lexical: If defined, also writes combined lexical tables. Writes to output_lexical.e2f and output_lexical.f2e, or output_lexical.counts.e2f in mode 'counts'. mode: declares the basic mixture-model algorithm. there are currently three options: 'counts': weighted counts (requires some statistics that Moses doesn't produce. Repeat step 4 of Moses training with the option -write-lexical-counts to obtain them.) Only the standard Moses features are recomputed from weighted counts; additional features are linearly interpolated (see number_of_features to allow more features, and i_e2f etc. if the standard features are in a non-standard position) 'interpolate': linear interpolation 'loglinear': loglinear interpolation (careful: this creates the intersection of phrase tables and is often of little use) number_of_features: could be used to interpolate models with non-default Moses features. 4 features is currently still hardcoded in various places (e.g. cross_entropy calculations, mode 'counts') i_e2f,i_e2f_lex,i_f2e,i_f2e_lex: Index of the (Moses) phrase table features p(s|t), lex(s|t), p(t|s) and lex(t|s). Relevant for mode 'counts', and if 'recompute_lexweights' is True in mode 'interpolate'. In mode 'counts', any additional features are combined through linear interpolation. model_interface: class that handles reading phrase tables and lexical tables, and writing phrase tables. Currently only Moses is implemented. default: Moses reference_interace: class that deals with reading in reference phrase pairs for cross-entropy computation Moses_Alignment: Word/phrase pairs as computed by Giza++ and extracted through Moses heuristics. This corresponds to the file model/extract.gz if you train a Moses model on your tuning set. TigerXML: TigerXML data format default: Moses_Alignment reference_file: path to reference file. Required for every operation except combination of models with given weights. lang_src: source language. Only required if reference_interface is TigerXML. Identifies which language in XML file we should treat as source language. lang_target: target language. Only required if reference_interface is TigerXML. Identifies which language in XML file we should treat as target language. intersected_cross-entropies: compute cross-entropies of intersection of phrase pairs, ignoring phrase pairs that do not occur in all models. If False, algorithm operates on union of phrase pairs default: False add_origin_features: For each model that is being combined, add a binary feature to the final phrase table, with values of 1 (phrase pair doesn't occur in model) and 2.718 (it does). This indicates which model(s) a phrase pair comes from and can be used during MERT to additionally reward/penalize translation models lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves. tempdir: temporary directory (for low memory mode). there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts' recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. default: False normalized: for interpolation of p(x|y): if True, models with p(y)=0 will be ignored, and probability mass will be distributed among models with p(y)>0. If False, missing entries (x,y) are always interpreted as p(x|y)=0. default: False normalize_s_given_t: How to we normalize p(s|t) if 'normalized' is True? Three options: None: don't normalize p(s|t) and lex(s|t) (only p(t|s) and lex(t|s)) t: check if p(t)==0 : advantage: theoretically sound; disadvantage: slower (we need to know if t occcurs in model); favours rare target phrases (relative to default choice) s: check if p(s)==0 : advantage: relevant for task; disadvantage: no true probability distributions default: None normalize-lexical_weights: also normalize lex(s|t) and lex(t|s) if 'normalized' ist True: reason why you might want to disable this: lexical weights suffer less from data sparseness than probabilities. default: True """ self.mode = mode self.output_file = output_file self.lang_src = lang_src self.lang_target = lang_target self.loaded = defaultdict(int) self.output_lexical = output_lexical self.flags = copy.copy(self.flags) self.flags.update(flags) self.flags['i_e2f'] = int(self.flags['i_e2f']) self.flags['i_e2f_lex'] = int(self.flags['i_e2f_lex']) self.flags['i_f2e'] = int(self.flags['i_f2e']) self.flags['i_f2e_lex'] = int(self.flags['i_f2e_lex']) if reference_interface: self.reference_interface = reference_interface(reference_file) if mode not in ['interpolate','loglinear','counts']: sys.stderr.write('Error: mode must be either "interpolate", "loglinear" or "counts"\n') sys.exit(1) models,number_of_features,weights = self._sanity_checks(models,number_of_features,weights) self.weights = weights self.models = models self.model_interface = model_interface(models,number_of_features) if mode == 'interpolate': self.score = score_interpolate elif mode == 'loglinear': self.score = score_loglinear elif mode == 'counts': self.score = score_counts def _sanity_checks(self,models,number_of_features,weights): """check if input arguments make sense (correct number of weights, valid model priorities etc.) is only called on initialization. If you change weights afterwards, better know what you're doing. """ number_of_features = int(number_of_features) for (model,priority) in models: assert(priority in self._priorities) models = [(model,self._priorities[p]) for (model,p) in models] # accept two types of weight declarations: one weight per model, and one weight per model and feature # type one is internally converted to type two: [0.1,0.9] -> [[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]] if weights: if type(weights[0]) == list: assert(len(weights)==number_of_features) for sublist in weights: assert(len(sublist)==len(models)) else: assert(len(models) == len(weights)) weights = [weights for i in range(number_of_features)] else: if self.mode == 'loglinear' or self.mode == 'interpolate': weights = [[1/len(models)]*len(models) for i in range(number_of_features)] elif self.mode == 'counts': weights = [[1]*len(models) for i in range(number_of_features)] sys.stderr.write('Warning: No weights defined: initializing with uniform weights\n') new_weights = normalize_weights(weights,self.mode,self.flags) if weights != new_weights: if self.mode == 'interpolate' or self.mode == 'loglinear': sys.stderr.write('Warning: weights should sum to 1 - ') elif self.mode == 'counts': sys.stderr.write('Warning: normalizing weights so that first model has weight 1 (for features that are recomputed from counts) - ') sys.stderr.write('normalizing to: '+ str(new_weights) +'\n') weights = new_weights return models,number_of_features,weights def _ensure_loaded(self,data): """load data (lexical tables; reference alignment; phrase table), if it isn't already in memory""" if 'lexical' in data: self.model_interface.require_alignment = True if 'reference' in data and not self.loaded['reference']: sys.stderr.write('Loading word pairs from reference set...') self.reference_interface.load_word_pairs(self.lang_src,self.lang_target) sys.stderr.write('done\n') self.loaded['reference'] = 1 if 'lexical' in data and not self.loaded['lexical']: sys.stderr.write('Loading lexical tables...') self.model_interface.load_lexical_tables(self.models,self.mode) sys.stderr.write('done\n') self.loaded['lexical'] = 1 if 'pt-filtered' in data and not self.loaded['pt-filtered']: models_prioritized = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)] for model,priority,i in models_prioritized: sys.stderr.write('Loading phrase table ' + str(i) + ' (only data relevant for reference set)') j = 0 for line in model: if not j % 1000000: sys.stderr.write('...'+str(j)) j += 1 line = line.rstrip().split(b' ||| ') if line[-1].endswith(b' |||'): line[-1] = line[-1][:-4] line.append('') self.model_interface.load_phrase_features(line,priority,i,store='all',mode=self.mode,filter_by=self.reference_interface.word_pairs,filter_by_src=self.reference_interface.word_source,filter_by_target=self.reference_interface.word_target,flags=self.flags) sys.stderr.write(' done\n') self.loaded['pt-filtered'] = 1 if 'lexical-filtered' in data and not self.loaded['lexical-filtered']: e2f_filter, f2e_filter = _get_lexical_filter(self.reference_interface,self.model_interface) sys.stderr.write('Loading lexical tables (only data relevant for reference set)...') self.model_interface.load_lexical_tables(self.models,self.mode,e2f_filter=e2f_filter,f2e_filter=f2e_filter) sys.stderr.write('done\n') self.loaded['lexical-filtered'] = 1 if 'pt-target' in data and not self.loaded['pt-target']: models_prioritized = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)] for model,priority,i in models_prioritized: sys.stderr.write('Loading target information from phrase table ' + str(i)) j = 0 for line in model: if not j % 1000000: sys.stderr.write('...'+str(j)) j += 1 line = line.rstrip().split(b' ||| ') if line[-1].endswith(b' |||'): line[-1] = line[-1][:-4] line.append('') self.model_interface.load_phrase_features(line,priority,i,mode=self.mode,store='target',flags=self.flags) sys.stderr.write(' done\n') self.loaded['pt-target'] = 1 def _inverse_wrapper(self,weights,tempdir=None): """if we want to invert the phrase table to better calcualte p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables""" sys.stderr.write('Processing first table half\n') models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir) self._write_phrasetable(models,pt_half1,weights) pt_half1.seek(0) sys.stderr.write('Inverting tables\n') models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] sys.stderr.write('Processing second table half\n') pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir) self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True) pt_half2_inverted.close() for model,priority,i in models: model.close() os.remove(model.name) pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir) os.remove(pt_half2_inverted.name) sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file)) output_object = handle_file(self.output_file,'open',mode='w') self.model_interface.merge(pt_half1,pt_half2,output_object,self.mode) os.remove(pt_half1.name) os.remove(pt_half2.name) handle_file(self.output_file,'close',output_object,mode='w') def _write_phrasetable(self,models,output_object,weights,inverted=False): """Incrementally load phrase tables, calculate score for increment and write it to output_object""" # define which information we need to store from the phrase table # possible flags: 'all', 'target', 'source' and 'pairs' # interpolated models without re-normalization only need 'pairs', otherwise 'all' is the correct choice store_flag = 'all' if self.mode == 'interpolate' and not self.flags['normalized']: store_flag = 'pairs' i = 0 sys.stderr.write('Incrementally loading and processing phrase tables...') for block in self.model_interface.traverse_incrementally('phrase-table',models,self.model_interface.load_phrase_features,store_flag,mode=self.mode,inverted=inverted,lowmem=self.flags['lowmem'],flags=self.flags): for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'): for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'): if not i % 1000000: sys.stderr.write(str(i) + '...') i += 1 features = self.score(weights,src,target,self.model_interface,self.flags) outline = self.model_interface.write_phrase_table(src,target,weights,features,self.mode, self.flags) output_object.write(outline) sys.stderr.write('done\n') def combine_given_weights(self,weights=None): """write a new phrase table, based on existing weights""" if not weights: weights = self.weights data = [] if self.mode == 'counts': data.append('lexical') if not self.flags['lowmem']: data.append('pt-target') elif self.mode == 'interpolate': if self.flags['recompute_lexweights']: data.append('lexical') if self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't' and not self.flags['lowmem']: data.append('pt-target') self._ensure_loaded(data) if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'): self._inverse_wrapper(weights,tempdir=self.flags['tempdir']) else: models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)] output_object = handle_file(self.output_file,'open',mode='w') self._write_phrasetable(models,output_object,weights) handle_file(self.output_file,'close',output_object,mode='w') if self.output_lexical: sys.stderr.write('Writing lexical tables\n') self._ensure_loaded(['lexical']) self.model_interface.write_lexical_file('e2f',self.output_lexical,weights[1],self.mode) self.model_interface.write_lexical_file('f2e',self.output_lexical,weights[3],self.mode) def combine_given_tuning_set(self): """write a new phrase table, using the weights that minimize cross-entropy on a tuning set""" data = ['reference','pt-filtered'] if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): data.append('lexical-filtered') self._ensure_loaded(data) best_weights,best_cross_entropy = optimize_cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) sys.stderr.write('Best weights: ' + str(best_weights) + '\n') sys.stderr.write('Cross entropies: ' + str(best_cross_entropy) + '\n') sys.stderr.write('Executing action combine_given_weights with -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights]))) self.loaded['pt-filtered'] = False # phrase table will be overwritten self.combine_given_weights(weights=best_weights) def combine_reordering_tables(self,weights=None): """write a new reordering table, based on existing weights.""" if not weights: weights = self.weights data = [] if self.mode != 'interpolate': sys.stderr.write('Error: only linear interpolation is supported for reordering model combination') output_object = handle_file(self.output_file,'open',mode='w') models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)] i = 0 sys.stderr.write('Incrementally loading and processing phrase tables...') for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',mode=self.mode,lowmem=self.flags['lowmem'],flags=self.flags): for src in sorted(self.model_interface.reordering_pairs): for target in sorted(self.model_interface.reordering_pairs[src]): if not i % 1000000: sys.stderr.write(str(i) + '...') i += 1 features = score_interpolate_reordering(weights,src,target,self.model_interface) outline = self.model_interface.write_reordering_table(src,target,features) output_object.write(outline) sys.stderr.write('done\n') handle_file(self.output_file,'close',output_object,mode='w') def compare_cross_entropies(self): """print cross-entropies for each model/feature, using the intersection of phrase pairs. analysis tool. """ self.flags['compare_cross-entropies'] = True data = ['reference','pt-filtered'] if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): data.append('lexical-filtered') self._ensure_loaded(data) results, (intersection,total_pairs,oov2) = cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) padding = 90 num_features = self.model_interface.number_of_features print('\nResults of model comparison\n') print('{0:<{padding}}: {1}'.format('phrase pairs in reference (tokens)',total_pairs, padding=padding)) print('{0:<{padding}}: {1}'.format('phrase pairs in model intersection (tokens)',intersection, padding=padding)) print('{0:<{padding}}: {1}\n'.format('phrase pairs in model union (tokens)',total_pairs-oov2, padding=padding)) for i,data in enumerate(results): cross_entropies = data[:num_features] (other_translations,oov,ignored,n,total_pairs) = data[num_features:] print('model ' +str(i)) for j in range(num_features): print('{0:<{padding}}: {1}'.format('cross-entropy for feature {0}'.format(j), cross_entropies[j], padding=padding)) print('{0:<{padding}}: {1}'.format('phrase pairs in model (tokens)', n+ignored, padding=padding)) print('{0:<{padding}}: {1}'.format('phrase pairs in model, but not in intersection (tokens)', ignored, padding=padding)) print('{0:<{padding}}: {1}'.format('phrase pairs in union, but not in model (but source phrase is) (tokens)', other_translations, padding=padding)) print('{0:<{padding}}: {1}\n'.format('phrase pairs in union, but source phrase not in model (tokens)', oov, padding=padding)) self.flags['compare_cross-entropies'] = False return results, (intersection,total_pairs,oov2) def compute_cross_entropy(self): """return cross-entropy for a tuning set, a set of models and a set of weights. analysis tool. """ data = ['reference','pt-filtered'] if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): data.append('lexical-filtered') self._ensure_loaded(data) current_cross_entropy = cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) sys.stderr.write('Cross entropy: ' + str(current_cross_entropy) + '\n') return current_cross_entropy def return_best_cross_entropy(self): """return the set of weights and cross-entropy that is optimal for a tuning set and a set of models.""" data = ['reference','pt-filtered'] if self.mode == 'counts' or (self.mode == 'interpolate' and self.flags['recompute_lexweights']): data.append('lexical-filtered') self._ensure_loaded(data) best_weights,best_cross_entropy = optimize_cross_entropy(self.model_interface,self.reference_interface,self.weights,self.score,self.mode,self.flags) sys.stderr.write('Best weights: ' + str(best_weights) + '\n') sys.stderr.write('Cross entropies: ' + str(best_cross_entropy) + '\n') sys.stderr.write('You can apply these weights with the action combine_given_weights and the option -w "{0}"\n'.format('; '.join([', '.join(str(w) for w in item) for item in best_weights]))) return best_weights,best_cross_entropy def test(): """test (and illustrate) the functionality of the program based on two test phrase tables and a small reference set,""" # linear interpolation of two models, with fixed weights. Output uses vocabulary of model1 (since model2 is supplementary) # command line: (currently not possible to define supplementary models through command line) sys.stderr.write('Regression test 1\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'supplementary']],[0.5,0.5],os.path.join('test','phrase-table_test1')) Combiner.combine_given_weights() # linear interpolation of two models, with fixed weights (but different for each feature). # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test2 sys.stderr.write('Regression test 2\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test2')) Combiner.combine_given_weights() # count-based combination of two models, with fixed weights # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test3 -m counts sys.stderr.write('Regression test 3\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test3'),mode='counts') Combiner.combine_given_weights() # output phrase table should be identical to model1 # command line: python tmcombine.py combine_given_weights test/model1 -w 1 -o test/phrase-table_test4 -m counts sys.stderr.write('Regression test 4\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary']],[1],os.path.join('test','phrase-table_test4'),mode='counts') Combiner.combine_given_weights() # count-based combination of two models with weights set through perplexity minimization # command line: python tmcombine.py combine_given_tuning_set test/model1 test/model2 -o test/phrase-table_test5 -m counts -r test/extract sys.stderr.write('Regression test 5\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],output_file=os.path.join('test','phrase-table_test5'),mode='counts',reference_file='test/extract') Combiner.combine_given_tuning_set() # loglinear combination of two models with fixed weights # command line: python tmcombine.py combine_given_weights test/model1 test/model2 -w 0.1,0.9 -o test/phrase-table_test6 -m loglinear sys.stderr.write('Regression test 6\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],weights=[0.1,0.9],output_file=os.path.join('test','phrase-table_test6'),mode='loglinear') Combiner.combine_given_weights() # cross-entropy analysis of two models through a reference set # command line: python tmcombine.py compare_cross_entropies test/model1 test/model2 -m counts -r test/extract sys.stderr.write('Regression test 7\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'primary']],mode='counts',reference_file='test/extract') f = open(os.path.join('test','phrase-table_test7'),'w') f.write(str(Combiner.compare_cross_entropies())) f.close() # maximum a posteriori combination of two models (Bacchiani et al. 2004; Foster et al. 2010) with weights set through cross-entropy minimization # command line: (currently not possible through command line) sys.stderr.write('Regression test 8\n') Combiner = Combine_TMs([[os.path.join('test','model1'),'primary'],[os.path.join('test','model2'),'map']],output_file=os.path.join('test','phrase-table_test8'),mode='counts',reference_file='test/extract') Combiner.combine_given_tuning_set() # count-based combination of two non-default models, with fixed weights. Same as test 3, but with the standard features moved back # command line: python tmcombine.py combine_given_weights test/model3 test/model4 -w "0.5,0.5;0.5,0.5;0.5,0.5;0.5,0.5;0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test9 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract sys.stderr.write('Regression test 9\n') Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],[[0.5,0.5],[0.5,0.5],[0.5,0.5],[0.5,0.5],[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test9'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7) Combiner.combine_given_weights() # count-based combination of two non-default models, with fixed weights. Same as test 5, but with the standard features moved back # command line: python tmcombine.py combine_given_tuning_set test/model3 test/model4 -o test/phrase-table_test10 -m counts --number_of_features 8 --i_e2f 4 --i_e2f_lex 5 --i_f2e 6 --i_f2e_lex 7 -r test/extract sys.stderr.write('Regression test 10\n') Combiner = Combine_TMs([[os.path.join('test','model3'),'primary'],[os.path.join('test','model4'),'primary']],output_file=os.path.join('test','phrase-table_test10'),mode='counts',number_of_features=8,i_e2f=4,i_e2f_lex=5,i_f2e=6,i_f2e_lex=7,reference_file='test/extract') Combiner.combine_given_tuning_set() # count-based combination of two hierarchical models, with fixed weights. Same as test 3, but with hierarchical models # command line: python tmcombine.py combine_given_weights test/model5 test/model6 -w "0.1,0.9;0.1,1;0.2,0.8;0.5,0.5" -o test/phrase-table_test11 -m counts sys.stderr.write('Regression test 11\n') Combiner = Combine_TMs([[os.path.join('test','model5'),'primary'],[os.path.join('test','model6'),'primary']],[[0.1,0.9],[0.1,1],[0.2,0.8],[0.5,0.5]],os.path.join('test','phrase-table_test11'),mode='counts') Combiner.combine_given_weights() #convert weight vector passed as a command line argument class to_list(argparse.Action): def __call__(self, parser, namespace, weights, option_string=None): if ';' in weights: values = [[float(x) for x in vector.split(',')] for vector in weights.split(';')] else: values = [float(x) for x in weights.split(',')] setattr(namespace, self.dest, values) def parse_command_line(): parser = argparse.ArgumentParser(description='Combine translation models. Check DOCSTRING of the class Combine_TMs() and its methods for a more in-depth documentation and additional configuration options not available through the command line. The function test() shows examples.') group1 = parser.add_argument_group('Main options') group2 = parser.add_argument_group('More model combination options') group1.add_argument('action', metavar='ACTION', choices=["combine_given_weights","combine_given_tuning_set","combine_reordering_tables","compute_cross_entropy","return_best_cross_entropy","compare_cross_entropies"], help='What you want to do with the models. One of %(choices)s.') group1.add_argument('model', metavar='DIRECTORY', nargs='+', help='Model directory. Assumes default Moses structure (i.e. path to phrase table and lexical tables).') group1.add_argument('-w', '--weights', dest='weights', action=to_list, default=None, help='weight vector. Format 1: single vector, one weight per model. Example: \"0.1,0.9\" ; format 2: one vector per feature, one weight per model: \"0.1,0.9;0.5,0.5;0.4,0.6;0.2,0.8\"') group1.add_argument('-m', '--mode', type=str, default="interpolate", choices=["counts","interpolate","loglinear"], help='basic mixture-model algorithm. Default: %(default)s. Note: depending on mode and additional configuration, additional statistics are needed. Check docstring documentation of Combine_TMs() for more info.') group1.add_argument('-r', '--reference', type=str, default=None, help='File containing reference phrase pairs for cross-entropy calculation. Default interface expects \'path/model/extract.gz\' that is produced by training a model on the reference (i.e. development) corpus.') group1.add_argument('-o', '--output', type=str, default="-", help='Output file (phrase table). If not specified, model is written to standard output.') group1.add_argument('--output-lexical', type=str, default=None, help=('Not only create a combined phrase table, but also combined lexical tables. Writes to OUTPUT_LEXICAL.e2f and OUTPUT_LEXICAL.f2e, or OUTPUT_LEXICAL.counts.e2f in mode \'counts\'.')) group1.add_argument('--lowmem', action="store_true", help=('Low memory mode: requires two passes (and sorting in between) to combine a phrase table, but loads less data into memory. Only relevant for mode "counts" and some configurations of mode "interpolate".')) group1.add_argument('--tempdir', type=str, default=None, help=('Temporary directory in --lowmem mode.')) group2.add_argument('--i_e2f', type=int, default=0, metavar='N', help=('Index of p(f|e) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)')) group2.add_argument('--i_e2f_lex', type=int, default=1, metavar='N', help=('Index of lex(f|e) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). (default: %(default)s)')) group2.add_argument('--i_f2e', type=int, default=2, metavar='N', help=('Index of p(e|f) (relevant for mode counts if phrase table has custom feature order). (default: %(default)s)')) group2.add_argument('--i_f2e_lex', type=int, default=3, metavar='N', help=('Index of lex(e|f) (relevant for mode counts or with option recompute_lexweights if phrase table has custom feature order). (default: %(default)s)')) group2.add_argument('--number_of_features', type=int, default=4, metavar='N', help=('Combine models with N + 1 features (last feature is constant phrase penalty). (default: %(default)s)')) group2.add_argument('--normalized', action="store_true", help=('for each phrase pair x,y: ignore models with p(y)=0, and distribute probability mass among models with p(y)>0. (default: missing entries (x,y) are always interpreted as p(x|y)=0). Only relevant in mode "interpolate".')) group2.add_argument('--write-phrase-penalty', action="store_true", help=("Include phrase penalty in phrase table")) group2.add_argument('--recompute_lexweights', action="store_true", help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".')) return parser.parse_args() if __name__ == "__main__": if len(sys.argv) < 2: sys.stderr.write("no command specified. use option -h for usage instructions\n") elif sys.argv[1] == "test": test() else: args = parse_command_line() #initialize combiner = Combine_TMs([(m,'primary') for m in args.model], weights=args.weights, mode=args.mode, output_file=args.output, reference_file=args.reference, output_lexical=args.output_lexical, lowmem=args.lowmem, normalized=args.normalized, recompute_lexweights=args.recompute_lexweights, tempdir=args.tempdir, number_of_features=args.number_of_features, i_e2f=args.i_e2f, i_e2f_lex=args.i_e2f_lex, i_f2e=args.i_f2e, i_f2e_lex=args.i_f2e_lex, write_phrase_penalty=args.write_phrase_penalty) # execute right method f_string = "combiner."+args.action+'()' exec(f_string)