tmpdir option and bugfix to reordering

This commit is contained in:
Rico Sennrich 2012-02-27 14:11:47 +01:00
parent d03a598b11
commit e9d960d73e

View File

@ -196,7 +196,7 @@ class Moses():
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False):
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False,lowmem=False):
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
relies on alphabetical sorting of phrase table.
"""
@ -209,7 +209,9 @@ class Moses():
self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]]))
self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
self.phrase_source = defaultdict(lambda: [0]*len(self.models))
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
if lowmem:
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
for model,priority,i in models:
@ -451,10 +453,10 @@ class Moses():
return line
def create_inverse(self,fobj):
def create_inverse(self,fobj,tempdir=None):
"""swap source and target phrase in the phrase table, and then sort (by target phrase)"""
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False)
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir)
swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|')
# just swap source and target phrase, and leave order of scores etc. intact.
@ -463,7 +465,7 @@ class Moses():
inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1))
inverse.close()
inverse_sorted = sort_file(inverse.name)
inverse_sorted = sort_file(inverse.name,tempdir=tempdir)
os.remove(inverse.name)
return inverse_sorted
@ -1254,14 +1256,14 @@ def handle_file(filename,action,fileobj=None,mode='r'):
fileobj.close()
def sort_file(filename):
def sort_file(filename,tempdir=None):
"""Sort a file and return temporary file"""
cmd = ['sort', filename]
env = {}
env['LC_ALL'] = 'C'
outfile = NamedTemporaryFile(delete=False)
outfile = NamedTemporaryFile(delete=False,dir=tempdir)
sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n')
p = Popen(cmd,env=env,stdout=outfile.file)
p.wait()
@ -1344,6 +1346,8 @@ class Combine_TMs():
lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
tempdir: temporary directory (for low memory mode).
there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'
recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
@ -1507,25 +1511,25 @@ class Combine_TMs():
self.loaded['pt-target'] = 1
def _inverse_wrapper(self,weights):
def _inverse_wrapper(self,weights,tempdir=None):
"""if we want to invert the phrase table to better calcualte p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""
sys.stderr.write('Processing first table half\n')
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False)
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir)
self._write_phrasetable(models,pt_half1,weights)
pt_half1.seek(0)
sys.stderr.write('Inverting tables\n')
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table')),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
sys.stderr.write('Processing second table half\n')
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False)
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir)
self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True)
pt_half2_inverted.close()
for model,priority,i in models:
model.close()
os.remove(model.name)
pt_half2 = sort_file(pt_half2_inverted.name)
pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir)
os.remove(pt_half2_inverted.name)
sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file))
@ -1549,7 +1553,7 @@ class Combine_TMs():
i = 0
sys.stderr.write('Incrementally loading and processing phrase tables...')
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted):
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted,lowmem=self.flags['lowmem']):
for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'):
for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'):
@ -1586,7 +1590,7 @@ class Combine_TMs():
self._ensure_loaded(data)
if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
self._inverse_wrapper(weights)
self._inverse_wrapper(weights,tempdir=self.flags['tempdir'])
else:
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
output_object = handle_file(self.output_file,'open',mode='w')
@ -1631,12 +1635,12 @@ class Combine_TMs():
sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')
output_object = handle_file(self.output_file,'open',mode='w')
models = [(self.open_table(model,table),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
i = 0
sys.stderr.write('Incrementally loading and processing phrase tables...')
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs'):
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',lowmem=self.flags['lowmem']):
for src in sorted(self.model_interface.reordering_pairs):
for target in sorted(self.model_interface.reordering_pairs[src]):
@ -1829,6 +1833,11 @@ def parse_command_line():
parser.add_argument('--recompute_lexweights', action="store_true",
help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))
parser.add_argument('--tempdir', type=str,
default=None,
help=('Temporary directory in --lowmem mode.'))
return parser.parse_args()
if __name__ == "__main__":
@ -1842,7 +1851,7 @@ if __name__ == "__main__":
else:
args = parse_command_line()
#initialize
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights)
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights,tempdir=args.tempdir)
# execute right method
f_string = "combiner."+args.action+'()'
exec(f_string)