Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2025-01-08 20:46:59 +03:00)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
commit 5fea1ad548
scripts/generic/score_parallel.py (new executable file, 775 lines)
@@ -0,0 +1,775 @@
#! /usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Script contributed by Precision Translation Tools.

"""Run Moses `score` jobs in parallel.

This script is a replacement for `score-parallel.perl`. The two are similar,
but there are differences in usage. In addition, this script can be called
directly from Python code without the need to run it as a separate process.
"""

from __future__ import (
    absolute_import,
    print_function,
    unicode_literals,
    )

__metaclass__ = type

from argparse import ArgumentParser
from contextlib import contextmanager
from datetime import datetime
import errno
import gzip
from multiprocessing import Pool
import os
import os.path
import pipes
from shutil import rmtree
from subprocess import check_call
import sys
import tempfile


def get_unicode_type():
    """Return the Unicode string type appropriate to this Python version."""
    if sys.version_info.major <= 2:
        # Unicode string type. In Python 2 this is the "unicode" type,
        # while "str" is a binary string type.
        return unicode
    else:
        # Unicode string type. In Python 3 this is the default "str" type.
        # The binary string type is now called "bytes".
        return str


UNICODE_TYPE = get_unicode_type()


class CommandLineError(Exception):
    """Invalid command line."""


class ProgramFailure(Exception):
    """Failure, not a bug, which is reported neatly to the user."""


def parse_args():
    """Parse command line arguments, return as `Namespace`."""
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        '--extract-file', '-e', metavar='PATH', required=True,
        help=(
            "Path to input file: extract file (e.g. 'extract.sorted.gz' or "
            "'extract.inv.sorted.gz'). Required."))
    parser.add_argument(
        '--lex-file', '-l', metavar='PATH', required=True,
        help=(
            "Path to input file: lex file (e.g. 'lex.f2e' or 'lex.e2f'). "
            "Required."))
    parser.add_argument(
        '--output', '-o', metavar='PATH', required=True,
        help=(
            "Write phrase table to file PATH (e.g. 'phrase-table.half.f2e' "
            "or 'phrase-table.half.e2f'). Required."))
    parser.add_argument(
        '--inverse', '-i', action='store_true',
        help="Inverse scoring. Defaults to direct scoring.")
    parser.add_argument(
        '--labels-file', '-L', metavar='PATH',
        help="Also write source labels to file PATH.")
    parser.add_argument(
        '--parts-of-speech', '-p', metavar='PATH',
        help="Also write parts-of-speech file to PATH.")
    parser.add_argument(
        '--flexibility-score', '-F', metavar='PATH',
        help="Path to the 'flexibility_score.py' script. Defaults to none.")
    parser.add_argument(
        '--hierarchical', '-H', action='store_true',
        help="Process hierarchical rules.")
    parser.add_argument(
        '--args', '-a', metavar='ARGUMENTS',
        help="Additional arguments for `score` and `flexibility_score`.")
    parser.add_argument(
        '--sort', '-s', action='store_true',
        help="Sort output file.")
    parser.add_argument(
        '--jobs', '-j', metavar='N', type=int, default=1,
        help="Run up to N jobs in parallel. Defaults to %(default)s.")
    parser.add_argument(
        '--score-exe', '-x', metavar='PROGRAM',
        help="Name of, or path to, the 'score' executable.")
    parser.add_argument(
        '--sort-command', '-S', metavar='COMMAND-LINE',
        help=(
            "Command line for sorting text files to standard output. "
            "Must support operation as a pipe, as well as input files named "
            "as command-line arguments."))
    parser.add_argument(
        '--gzip-command', '-z', metavar='PROGRAM',
        help="Path to a gzip or pigz executable.")
    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help="Print what's going on.")
    parser.add_argument(
        '--debug', '-d', action='store_true',
        help="Don't delete temporary directories when done.")
    return parser.parse_args()


def normalize_path(optional_path=None):
    """Return a cleaned-up version of a given filesystem path, or None.

    Converts the path to the operating system's native conventions, and
    removes redundancies like `.`.

    The return value will be `None`, an absolute path, or a relative path,
    same as the argument. But it will have redundant path separators,
    unnecessary detours through parent directories, and use of the current
    directory "." removed.
    """
    if optional_path is None:
        return None
    else:
        path = os.path.normpath(optional_path)
        path = path.replace('/', os.path.sep)
        path = path.replace('\\', os.path.sep)
        return path
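
# A rough doctest-style illustration of normalize_path() on a POSIX system
# (comment only, added here; not part of the original script):
#
#   >>> normalize_path('foo//bar/./baz')
#   'foo/bar/baz'
#   >>> normalize_path(None) is None
#   True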


def quote(path):
    """Quote and escape a filename for use in a shell command.

    The Windows implementation is very limited and will break on anything
    more advanced than a space.
    """
    if os.name == 'posix':
        return pipes.quote(path)
    else:
        # TODO: Improve escaping for Windows.
        return '"%s"' % path


def sanitize_args(args):
    """Check `args` for sanity, clean up, and set nontrivial defaults."""
    if args.jobs < 1:
        raise CommandLineError("Number of parallel jobs must be 1 or more.")
    if args.sort_command is None:
        args.sort_command = find_first_executable(
            ['neandersort', 'gsort', 'sort'])
    if args.sort_command is None:
        raise CommandLineError(
            "No 'sort' command is available. "
            "Choose one using the --sort-command option.")
    if args.gzip_command is None:
        args.gzip_command = find_first_executable(['pigz', 'gzip'])
    if args.gzip_command is None:
        raise CommandLineError(
            "No 'gzip' or 'pigz' command is available. "
            "Choose one using the --gzip-command option.")
    if args.score_exe is None:
        # Look for "score" executable. It may be in the current project
        # directory somewhere, or in the PATH.
        moses_dir = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))
        args.score_exe = find_first_executable(
            ['score'],
            [
                moses_dir,
                os.path.join(moses_dir, 'phrase-extract'),
                os.path.join(moses_dir, 'binaries'),
                ])
    args.extract_file = normalize_path(args.extract_file)
    args.lex_file = normalize_path(args.lex_file)
    args.output = normalize_path(args.output)
    args.labels_file = normalize_path(args.labels_file)
    args.parts_of_speech = normalize_path(args.parts_of_speech)
    args.flexibility_score = normalize_path(args.flexibility_score)
    args.score_exe = normalize_path(args.score_exe)


def add_exe_suffix(program):
    """Return the full filename for an executable.

    On Windows, this adds a `.exe` suffix to the name. On other
    systems, it returns the original name unchanged.
    """
    if os.name == 'nt':
        # Windows.
        return program + '.exe'
    else:
        # Assume POSIX or similar.
        return program


def find_executable(exe, extra_path=None):
    """Return full path to an executable of the given name, or `None`.

    If the given name is a qualified path to an executable, it will be returned
    unchanged. A qualified path where no executable is found results in a
    `CommandLineError`.
    """
    if extra_path is None:
        extra_path = []

    if os.path.sep in exe:
        # The executable name includes a path. Only one place it can be.
        if not os.path.isfile(exe) or not os.access(exe, os.X_OK):
            raise CommandLineError("Not an executable: '%s'." % exe)
        return exe

    for path in extra_path + os.getenv('PATH').split(os.pathsep):
        full_path = os.path.join(path, exe)
        if os.access(full_path, os.X_OK):
            return full_path
    return None


def find_first_executable(candidates, extra_path=None):
    """Find the first available of the given candidate programs.

    :raise ProgramFailure: If none of `candidates` was found.
    """
    for program in candidates:
        executable = find_executable(add_exe_suffix(program), extra_path)
        if executable is not None:
            return executable
    raise ProgramFailure(
        "Could not find any of these executables in path: %s."
        % ', '.join(candidates))
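
# For example (illustrative comment, added here): find_first_executable(
# ['pigz', 'gzip']) returns the full path to pigz if it is on the PATH,
# falls back to gzip otherwise, and raises ProgramFailure if neither is
# found.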


def execute_shell(command, verbose=False):
    """Run `command` string through the shell.

    Inherits environment, but sets `LC_ALL` to `C` for predictable results,
    especially from sort commands.

    This uses a full-featured shell, including pipes, substitution, etc. So
    remember to quote/escape arguments where appropriate!
    """
    assert isinstance(command, UNICODE_TYPE), (
        "Wrong argument for execute_shell.")
    if verbose:
        print("Executing: %s" % command)
    env = os.environ.copy()
    if os.name == 'posix':
        env['LC_ALL'] = 'C'
    check_call(command, shell=True, env=env)


@contextmanager
def tempdir(keep=False):
    """Context manager: temporary directory."""
    directory = tempfile.mkdtemp()
    try:
        yield directory
    finally:
        # Clean up even if the body raised, unless asked to keep the files.
        if not keep:
            rmtree(directory)


def make_dirs(path):
    """Equivalent to `mkdir -p -- path`."""
    try:
        os.makedirs(path)
    except OSError as error:
        if error.errno != errno.EEXIST:
            raise


def open_file(path, mode='r'):
    """Open a file, which may be gzip-compressed."""
    if path.endswith('.gz'):
        return gzip.open(path, mode)
    else:
        return open(path, mode)


def count_lines(filename):
    """Count the number of lines in `filename` (may be gzip-compressed)."""
    count = 0
    with open_file(filename) as stream:
        for _ in stream:
            count += 1
    return count


def set_temp_dir():
    """Set temporary directory to `$MOSES_TEMP_DIR`, if set.

    Create the directory if necessary.
    """
    temp_dir = os.getenv('MOSES_TEMP_DIR')
    if temp_dir is not None:
        make_dirs(temp_dir)
        tempfile.tempdir = temp_dir
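
# For instance, a caller might run the script as follows (hypothetical paths,
# shown only as an illustration):
#
#   MOSES_TEMP_DIR=/scratch/moses-tmp ./score_parallel.py \
#       -e extract.sorted.gz -l lex.f2e -o phrase-table.half.f2e.gz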


def strip_newline(line):
    """Remove trailing carriage return and/or line feed, if present."""
    if line.endswith('\n'):
        line = line[:-1]
    if line.endswith('\r'):
        line = line[:-1]
    return line


def open_chunk_file(split_dir, chunk_number):
    """Open a file to write one chunk of the extract file."""
    return open_file(
        os.path.join(split_dir, 'extract.%d.gz' % chunk_number), 'w')


def name_context_chunk_file(split_dir, chunk_number):
    """Compose file name for one chunk of the extract context file."""
    return os.path.join(
        split_dir, 'extract.context.%d.gz' % chunk_number)


def extract_source_phrase(line):
    """Extract the source phrase from an extract-file line."""
    # Lines are handled as decoded text throughout, so split on a text
    # delimiter rather than a bytes one.
    return line.split('|||', 1)[0]


def cut_context_file(last_source_phrase, chunk_file, last_line,
                     context_stream):
    """Write one chunk of extract context file into its own file.

    :param last_source_phrase: Last source phrase that should be in the
        chunk. Stop processing after this source phrase.
    :param chunk_file: Path to the extract context file for this chunk.
    :param last_line: Previously read line that may still need writing.
    :param context_stream: Extract context file, opened for reading.
    :return: Last line read from `context_stream`. This line will still
        need processing.
    """
    # TODO: Use open_file.
    with gzip.open(chunk_file, 'w') as chunk:
        if last_line is not None:
            chunk.write(('%s\n' % last_line).encode('utf-8'))

        # Are we processing our last source phrase yet?
        on_last_source_phrase = False

        # Write all lines in context file until we meet last source phrase
        # in extract file.
        for line in context_stream:
            # Reading from a gzip file returns byte lines *including the
            # newline*. Decode them, and ignore carriage returns as well.
            line = strip_newline(line.decode('utf-8'))
            source_phrase = extract_source_phrase(line)
            if on_last_source_phrase and source_phrase != last_source_phrase:
                # First new source phrase after our last one. We're done.
                return line
            else:
                # Still adding lines to our chunk.
                chunk.write(('%s\n' % line).encode('utf-8'))
                if source_phrase == last_source_phrase:
                    # We're on our last source phrase now.
                    on_last_source_phrase = True


def split_extract_files(split_dir, extract_file, extract_context_file=None,
                        jobs=1):
    """Split extract file into chunks, so we can process them in parallel.

    :param split_dir: A temporary directory where this function can write
        temporary files. The caller must ensure that this directory will be
        cleaned up after it's done with the files.
    :return: An iterable of tuples. Each tuple holds a partial extract file,
        and the corresponding context file. The files may be in `split_dir`,
        or there may just be the original extract file.
    """
    if jobs == 1:
        # No splitting needed. Read the original file(s).
        return [(extract_file, extract_context_file)]

    # Otherwise: split files.
    files = []
    num_lines = count_lines(extract_file)
    chunk_size = (num_lines + jobs - 1) // jobs
    assert isinstance(chunk_size, int)

    line_count = 0
    chunk_number = 0
    prev_source_phrase = None
    last_line_context = None
    extract_stream = open_file(extract_file)
    chunk_file = open_chunk_file(split_dir, chunk_number)
    if extract_context_file is None:
        chunk_context_file = None
    if extract_context_file is not None:
        context_stream = open_file(extract_context_file)

    for line in extract_stream:
        line_count += 1
        line = line.decode('utf-8')
        line = strip_newline(line)
        if line_count >= chunk_size:
            # At or over chunk size. Cut off at next source phrase change.
            source_phrase = extract_source_phrase(line)
            if prev_source_phrase is None:
                # Start looking for a different source phrase.
                prev_source_phrase = source_phrase
            elif source_phrase == prev_source_phrase:
                # Can't cut yet. Still working on the same source phrase.
                pass
            else:
                # Hit first new source phrase after chunk limit. Cut new
                # file(s).
                chunk_file.close()
                if extract_context_file is not None:
                    chunk_context_file = name_context_chunk_file(
                        split_dir, chunk_number)
                    last_line_context = cut_context_file(
                        prev_source_phrase, chunk_context_file,
                        last_line_context, context_stream)
                files.append((chunk_file.name, chunk_context_file))

                # Start on new chunk.
                prev_source_phrase = None
                line_count = 0
                chunk_number += 1
                chunk_file = open_chunk_file(split_dir, chunk_number)
        chunk_file.write(('%s\n' % line).encode('utf-8'))

    chunk_file.close()
    if extract_context_file is not None:
        chunk_context_file = name_context_chunk_file(split_dir, chunk_number)
        last_line_context = cut_context_file(
            prev_source_phrase, chunk_context_file, last_line_context,
            context_stream)
    files.append((chunk_file.name, chunk_context_file))
    return files
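
# To illustrate the cut rule (an invented toy example, added here): with
# chunk_size = 2 and an extract file whose source phrases are "a", "a", "a",
# "b", "b", the first chunk receives all three "a" lines, because a chunk may
# only end where the source phrase changes; the "b" lines start the next
# chunk. Keeping each source phrase within a single chunk is what lets each
# `score` instance see complete counts for the phrases it handles.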


def compose_score_command(extract_file, context_file, half_file,
                          flex_half_file, args):
    """Compose command line text to run one instance of `score`.

    :param extract_file: One chunk of extract file.
    :param context_file: If doing flexibility scoring, one chunk of
        extract context file. Otherwise, None.
    :param half_file: ???
    :param flex_half_file: ???
    :param args: Arguments namespace.
    """
    command = [
        args.score_exe,
        extract_file,
        args.lex_file,
        half_file,
        ]
    if args.args not in (None, ''):
        command.append(args.args)
    other_args = build_score_args(args)
    if other_args != '':
        command.append(other_args)
    if context_file is not None:
        command += [
            '&&',
            find_first_executable(['bzcat']),
            '|',
            quote(args.flexibility_score),
            quote(context_file),
            ]
        if args.inverse:
            command.append('--Inverse')
        if args.hierarchical:
            command.append('--Hierarchical')
        command += [
            '|',
            quote(args.gzip_command),
            '-c',
            '>%s' % quote(flex_half_file),
            ]
    return ' '.join(command)
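
# As a rough illustration (hypothetical score path, added here): for a plain
# direct scoring run with no context file and no extra arguments, the
# function above returns a single-line command along the lines of
#
#   /path/to/score /tmp/split/extract.0.gz lex.f2e
#       /tmp/split/phrase-table.half.000000.gz
#
# (wrapped here only for readability).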


def score_parallel(split_dir, file_pairs, args):
    """Run the `score` command in parallel.

    :param split_dir: Temporary directory where we can create split files.
    :param file_pairs: Sequence of tuples for the input files, one tuple
        per chunk of the work. Each tuple consists of a partial extract
        file, and optionally a partial extract context file.
    :param args: Arguments namespace.
    :return: A list of tuples. Each tuple contains two file paths. The first
        is for a partial half-phrase-table file. The second is for the
        corresponding partial flex file, if a context file is given; or
        `None` otherwise.
    """
    partial_files = []
    # Pool of worker processes for executing the partial "score" invocations
    # concurrently.
    pool = Pool(args.jobs)
    try:
        for chunk_num, file_pair in enumerate(file_pairs):
            half_file = os.path.join(
                split_dir, 'phrase-table.half.%06d.gz' % chunk_num)
            extract_file, context_file = file_pair
            if context_file is None:
                flex_half_file = None
            else:
                flex_half_file = os.path.join(
                    split_dir, 'phrase-table.half.%06d.flex.gz' % chunk_num)
            # Pickling of arguments for the pool is awkward on Windows, so
            # keep them simple. Compose the command line in the parent
            # process, then hand them to worker processes which execute them.
            command_line = compose_score_command(
                extract_file, context_file, half_file, flex_half_file, args)
            pool.apply_async(
                execute_shell, (command_line, ), {'verbose': args.verbose})
            partial_files.append((half_file, flex_half_file))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()
    return partial_files


def merge_and_sort(files, output, sort_command=None, gzip_exe=None,
                   verbose=False):
    """Merge partial files.

    :param files: List of partial half-phrase-table files.
    :param output: Path for resulting combined phrase-table file.
    """
    # TODO: The Perl code mentioned "sort" and "flexibility_score" here.
    # What do we do with those?

    # Sort whether we're asked to or not, as a way of combining the input
    # files.
    if sort_command == 'neandersort':
        # Neandersort transparently decompresses input and compresses output.
        check_call([
            'neandersort',
            '-o', output,
            ] + files)
    else:
        command = (
            "%(gzip)s -c -d %(files)s | "
            "%(sort)s | "
            "%(gzip)s -c >>%(output)s"
            % {
                'gzip': quote(gzip_exe),
                'sort': sort_command,
                'files': ' '.join(map(quote, files)),
                'output': quote(output),
                })
        execute_shell(command, verbose=verbose)
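
# With the default tools this composes a pipeline roughly like the following
# (hypothetical file names, shown only as an illustration):
#
#   pigz -c -d part1.gz part2.gz | sort | pigz -c >>phrase-table.half.f2e.gz
#
# Note that execute_shell() runs it with LC_ALL=C, so the sort order is byte
# order rather than locale-dependent.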


def build_score_args(args):
    """Compose command line for the `score` program."""
    command_line = []
    if args.labels_file:
        command_line += [
            '--SourceLabels',
            '--SourceLabelCountsLHS',
            '--SourceLabelSet',
            ]
    if args.parts_of_speech:
        command_line.append('--PartsOfSpeech')
    if args.inverse:
        command_line.append('--Inverse')
    if args.args is not None:
        command_line.append(args.args)
    return ' '.join(command_line)


def list_existing(paths):
    """Return, in the same order, those of the given files which exist."""
    return filter(os.path.exists, paths)


def compose_coc_path_for(path):
    """Compose COC-file path for the given file."""
    return '%s.coc' % path


def read_cocs(path):
    """Read COC file at `path`, return contents as tuple of ints."""
    with open(path) as lines:
        return tuple(
            int(line.rstrip('\r\n'))
            for line in lines
            )


def add_cocs(original, additional):
    """Add two tuples of COCs. Extend as needed."""
    assert not (original is None and additional is None), "No COCs to add!"
    if original is None:
        return additional
    elif additional is None:
        return original
    else:
        common = tuple(lhs + rhs for lhs, rhs in zip(original, additional))
        return (
            common +
            tuple(original[len(common):]) +
            tuple(additional[len(common):]))
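
# Doctest-style illustration of add_cocs() (comment only, added here):
#
#   >>> add_cocs((1, 2, 3), (4, 5))
#   (5, 7, 3)
#   >>> add_cocs(None, (1, 2))
#   (1, 2)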


def merge_coc(files, output):
    """Merge COC files for the given partial files.

    Each COC file is a series of integers, one per line. This reads them, and
    adds them up line-wise into one file of the same format: the sum of the
    numbers the respective files have at line 1, the sum of the numbers the
    respective files have at line 2, and so on.
    """
    assert len(files) > 0, "No partial files - no work to do."
    extract_files = [extract_file for extract_file, _ in files]
    if not os.path.exists(compose_coc_path_for(extract_files[0])):
        # Nothing to merge.
        return
    totals = None
    # TODO: Shouldn't we just fail if any of these files is missing?
    for coc_path in list_existing(map(compose_coc_path_for, extract_files)):
        totals = add_cocs(totals, read_cocs(coc_path))

    # Write to output file.
    with open(output, 'w') as output_stream:
        for entry in totals:
            output_stream.write('%d\n' % entry)


def suffix_line_numbers(infile, outfile):
    """Rewrite `infile` to `outfile`; suffix line number to each line.

    The line number is zero-based, and separated from the rest of the line
    by a single space.
    """
    temp_file = '%s.numbering' % outfile
    with open(infile, 'r') as instream, open(temp_file, 'w') as outstream:
        line_no = 0
        for line in instream:
            outstream.write(strip_newline(line))
            outstream.write(' %d\n' % line_no)
            line_no += 1
    os.rename(temp_file, outfile)
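
# For example (illustrative): an input file containing the lines "GlueTop",
# "GlueX" and "SSTART" is rewritten as "GlueTop 0", "GlueX 1", "SSTART 2".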


def compose_source_labels_path_for(path):
    """Return source labels file path for given file."""
    return '%s.syntaxLabels.src' % path


def merge_numbered_files(inputs, output, header_lines, sort_command,
                         verbose=False):
    """Sort and merge files `inputs`, add header and line numbers.

    :param inputs: Iterable of input files.
    :param output: Output file.
    :param header_lines: Iterable of header lines.
    :param sort_command: Command line for sorting input files.
    """
    sort_temp = '%s.sorting' % output
    with open(sort_temp, 'w') as stream:
        for line in header_lines:
            stream.write(line)
            stream.write('\n')
    execute_shell(
        "%s %s >>%s" % (
            sort_command,
            ' '.join(map(quote, inputs)),
            quote(sort_temp)),
        verbose=verbose)
    suffix_line_numbers(sort_temp, output)


def merge_source_labels(files, output, sort_command, verbose=False):
    """Merge source labels files."""
    # TODO: Shouldn't we just fail if any of these files is missing?
    extract_files = [extract_file for extract_file, _ in files]
    labels_files = list_existing(
        map(compose_source_labels_path_for, extract_files))
    header = [
        'GlueTop',
        'GlueX',
        'SSTART',
        'SEND',
        ]
    merge_numbered_files(
        labels_files, output, header, sort_command, verbose=verbose)


def compose_parts_of_speech_path_for(path):
    """Return parts-of-speech file path for given file."""
    return '%s.partsOfSpeech' % path


def merge_parts_of_speech(files, output, sort_command, verbose=False):
    """Merge parts-of-speech files into output."""
    # TODO: Shouldn't we just fail if any of these files is missing?
    extract_files = [extract_file for extract_file, _ in files]
    parts_files = list_existing(
        map(compose_parts_of_speech_path_for, extract_files))
    header = [
        'SSTART',
        'SEND',
        ]
    merge_numbered_files(
        parts_files, output, header, sort_command, verbose=verbose)


def main():
    """Command-line entry point. Marshals and forwards to `score_parallel`."""
    args = parse_args()
    sanitize_args(args)
    set_temp_dir()

    if args.flexibility_score is None:
        extract_context_file = None
    else:
        extract_context_file = args.extract_file.replace(
            'extract.', 'extract.context.')

    if args.verbose:
        print("Started %s." % datetime.now())
        print("Using '%s' for gzip." % args.gzip_command)

    with tempdir(args.debug) as split_dir:
        extract_files = split_extract_files(
            split_dir, args.extract_file,
            extract_context_file=extract_context_file, jobs=args.jobs)

        scored_files = score_parallel(split_dir, extract_files, args)

        if args.verbose:
            sys.stderr.write("Finished score %s.\n" % datetime.now())

        # TODO: Pass on "sort" and "flexibility-score" arguments?
        merge_and_sort(
            [phrase_chunk for phrase_chunk, _ in scored_files], args.output,
            sort_command=args.sort_command, gzip_exe=args.gzip_command,
            verbose=args.verbose)
        merge_coc(extract_files, compose_coc_path_for(args.output))

        if not args.inverse and args.labels_file is not None:
            if args.verbose:
                print("Merging source labels files.")
            merge_source_labels(
                extract_files, args.labels_file,
                sort_command=args.sort_command, verbose=args.verbose)

        if not args.inverse and args.parts_of_speech is not None:
            if args.verbose:
                print("Merging parts-of-speech files.")
            merge_parts_of_speech(
                extract_files, args.parts_of_speech,
                sort_command=args.sort_command, verbose=args.verbose)


if __name__ == '__main__':
    try:
        main()
    except ProgramFailure as error:
        sys.stderr.write('%s\n' % error)
        sys.exit(1)
    except CommandLineError as error:
        sys.stderr.write("Command line error: %s\n" % error)
        sys.exit(2)