mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-06 00:08:52 +03:00
117 lines
4.5 KiB
Python
Executable File
117 lines
4.5 KiB
Python
Executable File
#!/usr/bin/env python
|
|
#
|
|
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
|
# Public License version 2.1 or, at your option, any later version.
|
|
|
|
import argparse
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
|
|
# Feature field in N-best format
|
|
FEAT_FIELD = 2
|
|
|
|
# Location of mert, kbmira, etc. in relation to this script
|
|
BIN_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'bin')
|
|
|
|
def main():
|
|
|
|
# Args
|
|
parser = argparse.ArgumentParser(description='Learn N-best rescoring weights')
|
|
parser.add_argument('--nbest', metavar='nbest', \
|
|
help='Dev set N-best list augmented with new features', required=True)
|
|
parser.add_argument('--ref', metavar='ref', \
|
|
help='Dev set reference translation', required=True)
|
|
parser.add_argument('--working-dir', metavar='rescore-work', \
|
|
help='Optimizer working directory', required=True)
|
|
parser.add_argument('--bin-dir', metavar='DIR', \
|
|
help='Moses bin dir, containing kbmira, evaluator, etc.', default=BIN_DIR)
|
|
# Since we're starting with uniform weights and only running kbmira once,
|
|
# run a gratuitous number of iterations. (mert-moses.pl default is 60
|
|
# iterations for each Moses run)
|
|
parser.add_argument('--iterations', metavar='N', type=int, \
|
|
help='Number of K-best MIRA iterations to run (default: 300)', default=300)
|
|
args = parser.parse_args()
|
|
|
|
# Find executables
|
|
extractor = os.path.join(args.bin_dir, 'extractor')
|
|
kbmira = os.path.join(args.bin_dir, 'kbmira')
|
|
for exe in (extractor, kbmira):
|
|
if not os.path.exists(exe):
|
|
sys.stderr.write('Error: cannot find executable "{}" in "{}", please specify --bin-dir\n'.format(exe, args.bin_dir))
|
|
sys.exit(1)
|
|
|
|
# rescore-work dir
|
|
if not os.path.exists(args.working_dir):
|
|
os.mkdir(args.working_dir)
|
|
|
|
# Feature names and numbers of weights from N-best list
|
|
# Assume all features are dense (present for each entry)
|
|
init_weights = []
|
|
fields = [f.strip() for f in open(args.nbest).readline().split('|||')]
|
|
feats = fields[FEAT_FIELD].split()
|
|
for i in range(len(feats)):
|
|
if feats[i].endswith('='):
|
|
n_weights = 0
|
|
j = i + 1
|
|
while j < len(feats):
|
|
if feats[j].endswith('='):
|
|
break
|
|
n_weights += 1
|
|
j += 1
|
|
# Start all weights at 0
|
|
init_weights.append([feats[i], [0] * n_weights])
|
|
|
|
# Extract score and feature data from N-best list
|
|
extractor_cmd = [extractor, \
|
|
'--sctype', 'BLEU', '--scconfig', 'case:true', \
|
|
'--scfile', os.path.join(args.working_dir, 'scores.dat'), \
|
|
'--ffile', os.path.join(args.working_dir, 'features.dat'), \
|
|
'-r', args.ref, \
|
|
'-n', args.nbest]
|
|
subprocess.call(extractor_cmd)
|
|
|
|
# Write dense feature list
|
|
with open(os.path.join(args.working_dir, 'init.dense'), 'w') as out:
|
|
for (feat, weights) in init_weights:
|
|
for w in weights:
|
|
out.write('{} {}\n'.format(feat, w))
|
|
|
|
# Run K-best MIRA optimizer
|
|
kbmira_cmd = [kbmira, \
|
|
'--dense-init', os.path.join(args.working_dir, 'init.dense'), \
|
|
'--ffile', os.path.join(args.working_dir, 'features.dat'), \
|
|
'--scfile', os.path.join(args.working_dir, 'scores.dat'), \
|
|
'-o', os.path.join(args.working_dir, 'mert.out'), \
|
|
'--iters', str(args.iterations)]
|
|
subprocess.call(kbmira_cmd)
|
|
|
|
# Read optimized weights, sum for normalization
|
|
opt_weights = []
|
|
total = 0
|
|
with open(os.path.join(args.working_dir, 'mert.out')) as inp:
|
|
# Same structure as original weight list
|
|
for (feat, weights) in init_weights:
|
|
opt_weights.append([feat, []])
|
|
for _ in weights:
|
|
w = float(inp.readline().split()[1])
|
|
opt_weights[-1][1].append(w)
|
|
# Sum for normalization
|
|
total += abs(w)
|
|
|
|
# Normalize weights
|
|
for (_, weights) in opt_weights:
|
|
for i in range(len(weights)):
|
|
weights[i] /= total
|
|
|
|
# Generate rescore.ini
|
|
with open(os.path.join(args.working_dir, 'rescore.ini'), 'w') as out:
|
|
out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
|
|
out.write('\n')
|
|
out.write('[weight]\n')
|
|
for (feat, weights) in opt_weights:
|
|
out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))
|
|
|
|
if __name__ == '__main__':
|
|
main()
|