mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-04 01:45:52 +03:00
tmpdir option and bugfix to reordering
This commit is contained in:
parent
d03a598b11
commit
e9d960d73e
@ -196,7 +196,7 @@ class Moses():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False):
|
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False,lowmem=False):
|
||||||
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
|
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
|
||||||
relies on alphabetical sorting of phrase table.
|
relies on alphabetical sorting of phrase table.
|
||||||
"""
|
"""
|
||||||
@ -209,7 +209,9 @@ class Moses():
|
|||||||
self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]]))
|
self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]]))
|
||||||
self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
|
self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
|
||||||
self.phrase_source = defaultdict(lambda: [0]*len(self.models))
|
self.phrase_source = defaultdict(lambda: [0]*len(self.models))
|
||||||
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
|
|
||||||
|
if lowmem:
|
||||||
|
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
|
||||||
|
|
||||||
for model,priority,i in models:
|
for model,priority,i in models:
|
||||||
|
|
||||||
@ -451,10 +453,10 @@ class Moses():
|
|||||||
return line
|
return line
|
||||||
|
|
||||||
|
|
||||||
def create_inverse(self,fobj):
|
def create_inverse(self,fobj,tempdir=None):
|
||||||
"""swap source and target phrase in the phrase table, and then sort (by target phrase)"""
|
"""swap source and target phrase in the phrase table, and then sort (by target phrase)"""
|
||||||
|
|
||||||
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False)
|
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir)
|
||||||
swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|')
|
swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|')
|
||||||
|
|
||||||
# just swap source and target phrase, and leave order of scores etc. intact.
|
# just swap source and target phrase, and leave order of scores etc. intact.
|
||||||
@ -463,7 +465,7 @@ class Moses():
|
|||||||
inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1))
|
inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1))
|
||||||
inverse.close()
|
inverse.close()
|
||||||
|
|
||||||
inverse_sorted = sort_file(inverse.name)
|
inverse_sorted = sort_file(inverse.name,tempdir=tempdir)
|
||||||
os.remove(inverse.name)
|
os.remove(inverse.name)
|
||||||
|
|
||||||
return inverse_sorted
|
return inverse_sorted
|
||||||
@ -1254,14 +1256,14 @@ def handle_file(filename,action,fileobj=None,mode='r'):
|
|||||||
fileobj.close()
|
fileobj.close()
|
||||||
|
|
||||||
|
|
||||||
def sort_file(filename):
|
def sort_file(filename,tempdir=None):
|
||||||
"""Sort a file and return temporary file"""
|
"""Sort a file and return temporary file"""
|
||||||
|
|
||||||
cmd = ['sort', filename]
|
cmd = ['sort', filename]
|
||||||
env = {}
|
env = {}
|
||||||
env['LC_ALL'] = 'C'
|
env['LC_ALL'] = 'C'
|
||||||
|
|
||||||
outfile = NamedTemporaryFile(delete=False)
|
outfile = NamedTemporaryFile(delete=False,dir=tempdir)
|
||||||
sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n')
|
sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n')
|
||||||
p = Popen(cmd,env=env,stdout=outfile.file)
|
p = Popen(cmd,env=env,stdout=outfile.file)
|
||||||
p.wait()
|
p.wait()
|
||||||
@ -1344,6 +1346,8 @@ class Combine_TMs():
|
|||||||
|
|
||||||
lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
|
lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
|
||||||
|
|
||||||
|
tempdir: temporary directory (for low memory mode).
|
||||||
|
|
||||||
there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'
|
there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'
|
||||||
|
|
||||||
recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
|
recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
|
||||||
@ -1507,25 +1511,25 @@ class Combine_TMs():
|
|||||||
self.loaded['pt-target'] = 1
|
self.loaded['pt-target'] = 1
|
||||||
|
|
||||||
|
|
||||||
def _inverse_wrapper(self,weights):
|
def _inverse_wrapper(self,weights,tempdir=None):
|
||||||
"""if we want to invert the phrase table to better calculate p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""
|
"""if we want to invert the phrase table to better calculate p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""
|
||||||
|
|
||||||
sys.stderr.write('Processing first table half\n')
|
sys.stderr.write('Processing first table half\n')
|
||||||
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||||
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False)
|
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir)
|
||||||
self._write_phrasetable(models,pt_half1,weights)
|
self._write_phrasetable(models,pt_half1,weights)
|
||||||
pt_half1.seek(0)
|
pt_half1.seek(0)
|
||||||
|
|
||||||
sys.stderr.write('Inverting tables\n')
|
sys.stderr.write('Inverting tables\n')
|
||||||
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table')),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||||
sys.stderr.write('Processing second table half\n')
|
sys.stderr.write('Processing second table half\n')
|
||||||
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False)
|
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir)
|
||||||
self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True)
|
self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True)
|
||||||
pt_half2_inverted.close()
|
pt_half2_inverted.close()
|
||||||
for model,priority,i in models:
|
for model,priority,i in models:
|
||||||
model.close()
|
model.close()
|
||||||
os.remove(model.name)
|
os.remove(model.name)
|
||||||
pt_half2 = sort_file(pt_half2_inverted.name)
|
pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir)
|
||||||
os.remove(pt_half2_inverted.name)
|
os.remove(pt_half2_inverted.name)
|
||||||
|
|
||||||
sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file))
|
sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file))
|
||||||
@ -1549,7 +1553,7 @@ class Combine_TMs():
|
|||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
||||||
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted):
|
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted,lowmem=self.flags['lowmem']):
|
||||||
|
|
||||||
for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'):
|
for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'):
|
||||||
for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'):
|
for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'):
|
||||||
@ -1586,7 +1590,7 @@ class Combine_TMs():
|
|||||||
self._ensure_loaded(data)
|
self._ensure_loaded(data)
|
||||||
|
|
||||||
if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
|
if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
|
||||||
self._inverse_wrapper(weights)
|
self._inverse_wrapper(weights,tempdir=self.flags['tempdir'])
|
||||||
else:
|
else:
|
||||||
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||||
output_object = handle_file(self.output_file,'open',mode='w')
|
output_object = handle_file(self.output_file,'open',mode='w')
|
||||||
@ -1631,12 +1635,12 @@ class Combine_TMs():
|
|||||||
sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')
|
sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')
|
||||||
|
|
||||||
output_object = handle_file(self.output_file,'open',mode='w')
|
output_object = handle_file(self.output_file,'open',mode='w')
|
||||||
models = [(self.open_table(model,table),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
|
models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
||||||
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs'):
|
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',lowmem=self.flags['lowmem']):
|
||||||
|
|
||||||
for src in sorted(self.model_interface.reordering_pairs):
|
for src in sorted(self.model_interface.reordering_pairs):
|
||||||
for target in sorted(self.model_interface.reordering_pairs[src]):
|
for target in sorted(self.model_interface.reordering_pairs[src]):
|
||||||
@ -1829,6 +1833,11 @@ def parse_command_line():
|
|||||||
parser.add_argument('--recompute_lexweights', action="store_true",
|
parser.add_argument('--recompute_lexweights', action="store_true",
|
||||||
help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))
|
help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))
|
||||||
|
|
||||||
|
parser.add_argument('--tempdir', type=str,
|
||||||
|
default=None,
|
||||||
|
help=('Temporary directory in --lowmem mode.'))
|
||||||
|
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -1842,7 +1851,7 @@ if __name__ == "__main__":
|
|||||||
else:
|
else:
|
||||||
args = parse_command_line()
|
args = parse_command_line()
|
||||||
#initialize
|
#initialize
|
||||||
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights)
|
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights,tempdir=args.tempdir)
|
||||||
# execute right method
|
# execute right method
|
||||||
f_string = "combiner."+args.action+'()'
|
f_string = "combiner."+args.action+'()'
|
||||||
exec(f_string)
|
exec(f_string)
|
||||||
|
Loading…
Reference in New Issue
Block a user