#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
# assumes tokenized and sentence-split text.

# to get Moses XML format, first projectivize the trees, then use
# conll2mosesxml.py.

# Provenance: scripts/training/wrappers/parse-en-stanford.py, added in commit
# a1678187fead90da0e19da5a71a82e421b57ff06 ("wrapper for stanford dependency
# parser", Rico Sennrich, Fri, 22 May 2015 15:28:42 +0100).

from __future__ import print_function, unicode_literals
import os
import sys
import codecs
import argparse

from collections import defaultdict
from subprocess import Popen, PIPE

# hack for python2/3 compatibility
from io import open
argparse.open = open

# States of the line-oriented reader in get_sentences(): which kind of line
# is expected next in the CoreNLP text output.
_EXPECT_HEADER = 0   # a 'Sentence #...' header line
_EXPECT_TEXT = 1     # the whitespace-tokenized sentence itself
_EXPECT_TOKENS = 2   # the '[Text=... PartOfSpeech=... Lemma=...] [...]' line
_EXPECT_DEPS = 3     # 'label(head-i, dependent-j)' lines until a blank line


def create_parser():
    """Build the command-line parser for this wrapper.

    Options: --stanford (required, CoreNLP directory), --java (java
    executable, default 'java'), --input/-i and --output/-o (default to
    standard input / standard output).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Wrapper around Stanford CoreNLP to produce CoNLL dependency "
            "format. Assumes that text is tokenized and has one sentence "
            "per line."))

    parser.add_argument(
        '--stanford', type=str,
        metavar='PATH', required=True,
        help='path to Stanford CoreNLP')

    parser.add_argument(
        '--java', type=str, default='java',
        metavar='PATH',
        help='path to java executable')

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")
    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output text (default: standard output).")

    return parser


def process_stanford(infile, javacmd, stanfordpath):
    """Start Stanford CoreNLP on `infile` and return its stdout pipe.

    infile -- file object handed to the child process as its stdin.
    javacmd -- path to (or name of) the java executable.
    stanfordpath -- directory holding the stanford-corenlp-3.5.0 jars.

    The child's stderr is discarded. The returned pipe yields bytes; the
    caller is responsible for decoding it.
    """
    # os.pathsep is ':' on POSIX (as before) and ';' on Windows, which is
    # what the JVM expects as classpath separator on each platform.
    classpath = os.pathsep.join([
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar'),
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar'),
    ])

    # The child gets its own copy of the null-device handle, so the parent's
    # copy can be closed as soon as Popen() returns.
    with open(os.devnull, 'w') as devnull:
        stanford = Popen(
            [
                javacmd,
                '-cp', classpath,
                'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
                '-ssplit.eolonly', 'true',
                '-tokenize.whitespace', 'true',
                '-numThreads', '8',
                # NOTE(review): the next two option pairs are kept exactly
                # as in the original ('outFile' has no leading dash);
                # confirm against the CoreNLP command-line documentation
                # before changing them.
                '-textFile', '-',
                'outFile', '-',
            ],
            stdin=infile, stdout=PIPE, stderr=devnull)
    return stanford.stdout


def get_sentences(instream):
    """Parse CoreNLP text output into sentences.

    instream -- iterable of decoded text lines, each ending in a newline.

    Yields one list per sentence; each element is a dict with the keys
    'word', 'pos', 'lemma', 'head' (int, 0 for the root) and 'label'.
    If the token-annotation line does not match the number of words, a
    warning is written to stderr and the sentence is yielded with
    placeholder annotations ('-' and head 0).
    """
    sentence = []
    expect = _EXPECT_HEADER

    for line in instream:
        if expect == _EXPECT_HEADER and line.startswith('Sentence #'):
            if sentence:
                yield sentence
            sentence = []
            expect = _EXPECT_TEXT

        elif line == '\n':
            # blank line terminates the dependency list of a sentence
            expect = _EXPECT_HEADER

        elif expect == _EXPECT_DEPS:
            # e.g. 'nsubj(said-2, He-1)'. Split on the first '(' only:
            # with whitespace tokenization a token may itself contain '('.
            rel, remainder = line.split('(', 1)
            head, dep = remainder.split()
            # strip the trailing ',' resp. ')' after the token index
            head_int = int(head.split('-')[-1][:-1])
            dep_int = int(dep.split('-')[-1][:-1])
            word = sentence[dep_int - 1]
            word['head'] = head_int
            word['label'] = rel

        elif expect == _EXPECT_TOKENS:
            # '[Text=a ...] [Text=b ...]' -> one string per token
            linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [')
            if len(linesplit) != len(sentence):
                sys.stderr.write(
                    'Warning: mismatch in number of words in sentence\n')
                sys.stderr.write(
                    ' '.join(w['word'] for w in sentence) + '\n')
                for word in sentence:
                    word['pos'] = '-'
                    word['lemma'] = '-'
                    word['head'] = 0
                    word['label'] = '-'
                # skip this sentence's dependency lines
                expect = _EXPECT_HEADER
                continue
            for word, annotation in zip(sentence, linesplit):
                word['pos'] = (
                    annotation.split(' PartOfSpeech=')[-1].split()[0])
                word['lemma'] = annotation.split(' Lemma=')[-1]
            expect = _EXPECT_DEPS

        elif expect == _EXPECT_TEXT:
            sentence.extend({'word': w} for w in line.split())
            expect = _EXPECT_TOKENS

    if sentence:
        yield sentence


def write(sentence, outstream):
    """Write one sentence to `outstream`, one tab-separated line per token.

    Columns: 1-based index, word, lemma, pos, pos (again), '-', head, label.
    No blank separator line is written; the caller adds it.
    """
    for i, w in enumerate(sentence):
        outstream.write(
            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
                i + 1, w['word'], w['lemma'], w['pos'], w['pos'], '-',
                w['head'], w['label']))


if __name__ == '__main__':
    if sys.version_info < (3, 0):
        # Python 2: make the standard streams encode/decode UTF-8
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    options = parser.parse_args()

    stanford = process_stanford(options.input, options.java, options.stanford)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        # blank line separates sentences in the output
        options.output.write('\n')