# mosesdecoder/scripts/nbest-rescore/train.py

#!/usr/bin/env python
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

import argparse
import os
import subprocess
import sys

# Index of the feature field in a '|||'-delimited N-best entry
FEAT_FIELD = 2

# Location of mert, kbmira, etc. in relation to this script: the 'bin'
# directory two levels above the directory containing this file.  __file__ is
# made absolute first; a relative __file__ (e.g. when run as "python train.py")
# would otherwise collapse to '' under repeated dirname() and yield a bogus
# default of 'bin' relative to the current directory.
BIN_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'bin')

def _fail(message):
    """Write 'Error: <message>' to stderr and exit with status 1."""
    sys.stderr.write('Error: {}\n'.format(message))
    sys.exit(1)


def _read_dense_features(nbest_path):
    """Return [[name, [0, ...]], ...] built from the first N-best entry.

    Feature names are the tokens ending in '='; every following token up to
    the next name counts as one weight for that feature.  All weights start
    at 0.  Assumes all features are dense (present for each entry).
    """
    with open(nbest_path) as inp:
        first_line = inp.readline()
    fields = [f.strip() for f in first_line.split('|||')]
    if len(fields) <= FEAT_FIELD:
        _fail('cannot read feature field from first line of "{}"'.format(nbest_path))
    init_weights = []
    for token in fields[FEAT_FIELD].split():
        if token.endswith('='):
            init_weights.append([token, []])
        elif init_weights:
            # Tokens before the first feature name belong to no feature
            init_weights[-1][1].append(0)
    return init_weights


def _run_tool(cmd):
    """Run an external command, exiting with an error if it fails."""
    status = subprocess.call(cmd)
    if status != 0:
        _fail('"{}" exited with status {}'.format(cmd[0], status))


def _read_optimized_weights(path, init_weights):
    """Read one weight per line from path, mirroring init_weights' structure.

    Returns (opt_weights, total) where total is the sum of absolute weights.
    """
    opt_weights = []
    total = 0
    with open(path) as inp:
        for (feat, weights) in init_weights:
            values = []
            for _ in weights:
                parts = inp.readline().split()
                if len(parts) < 2:
                    _fail('"{}" contains fewer weights than the N-best list has dense features'.format(path))
                w = float(parts[1])
                values.append(w)
                # Sum for normalization
                total += abs(w)
            opt_weights.append([feat, values])
    return (opt_weights, total)


def main():
    """Learn N-best rescoring weights and write them to rescore.ini.

    Parses command-line arguments, runs the extractor and kbmira executables
    from --bin-dir on the given N-best list and reference, normalizes the
    optimized weights so their absolute values sum to 1, and writes
    'rescore.ini' (plus the intermediate files) into --working-dir.

    Exits with status 1 if an executable is missing, an external tool fails,
    or the N-best list / optimizer output cannot be parsed.
    """

    # Args
    parser = argparse.ArgumentParser(description='Learn N-best rescoring weights')
    parser.add_argument('--nbest', metavar='nbest', \
            help='Dev set N-best list augmented with new features', required=True)
    parser.add_argument('--ref', metavar='ref', \
            help='Dev set reference translation', required=True)
    parser.add_argument('--working-dir', metavar='rescore-work', \
            help='Optimizer working directory', required=True)
    parser.add_argument('--bin-dir', metavar='DIR', \
            help='Moses bin dir, containing kbmira, evaluator, etc.', default=BIN_DIR)
    # Since we're starting with uniform weights and only running kbmira once,
    # run a gratuitous number of iterations.  (mert-moses.pl default is 60
    # iterations for each Moses run)
    parser.add_argument('--iterations', metavar='N', type=int, \
            help='Number of K-best MIRA iterations to run (default: 300)', default=300)
    args = parser.parse_args()

    # Find executables
    extractor = os.path.join(args.bin_dir, 'extractor')
    kbmira = os.path.join(args.bin_dir, 'kbmira')
    for exe in (extractor, kbmira):
        if not os.path.exists(exe):
            _fail('cannot find executable "{}" in "{}", please specify --bin-dir'.format(exe, args.bin_dir))

    # rescore-work dir (makedirs so that nested paths can be created)
    if not os.path.isdir(args.working_dir):
        os.makedirs(args.working_dir)

    scores_file = os.path.join(args.working_dir, 'scores.dat')
    features_file = os.path.join(args.working_dir, 'features.dat')
    dense_file = os.path.join(args.working_dir, 'init.dense')
    mert_file = os.path.join(args.working_dir, 'mert.out')

    # Feature names and numbers of weights from N-best list
    init_weights = _read_dense_features(args.nbest)

    # Extract score and feature data from N-best list
    _run_tool([extractor, \
            '--sctype', 'BLEU', '--scconfig', 'case:true', \
            '--scfile', scores_file, \
            '--ffile', features_file, \
            '-r', args.ref, \
            '-n', args.nbest])

    # Write dense feature list (one "name weight" line per weight)
    with open(dense_file, 'w') as out:
        for (feat, weights) in init_weights:
            for w in weights:
                out.write('{} {}\n'.format(feat, w))

    # Run K-best MIRA optimizer
    _run_tool([kbmira, \
            '--dense-init', dense_file, \
            '--ffile', features_file, \
            '--scfile', scores_file, \
            '-o', mert_file, \
            '--iters', str(args.iterations)])

    # Read optimized weights, sum for normalization
    (opt_weights, total) = _read_optimized_weights(mert_file, init_weights)

    # Normalize weights; an all-zero weight vector cannot be normalized and
    # is written out unchanged instead of dividing by zero
    if total != 0:
        for (_, weights) in opt_weights:
            weights[:] = [w / total for w in weights]
    elif any(weights for (_, weights) in opt_weights):
        sys.stderr.write('Warning: all optimized weights are 0, skipping normalization\n')

    # Generate rescore.ini
    with open(os.path.join(args.working_dir, 'rescore.ini'), 'w') as out:
        out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
        out.write('\n')
        out.write('[weight]\n')
        for (feat, weights) in opt_weights:
            out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))

# Run the trainer only when executed as a script, not when imported
if "__main__" == __name__:
    main()