mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 12:52:29 +03:00
wrapper for stanford dependency parser
This commit is contained in:
parent
30a03237fa
commit
a1678187fe
129
scripts/training/wrappers/parse-en-stanford.py
Executable file
129
scripts/training/wrappers/parse-en-stanford.py
Executable file
@ -0,0 +1,129 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Rico Sennrich
|
||||
|
||||
# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
|
||||
# assumes tokenized and sentence-split text.
|
||||
|
||||
# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py.
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
import argparse
|
||||
|
||||
from collections import defaultdict
|
||||
from subprocess import Popen, PIPE
|
||||
|
||||
# hack for python2/3 compatibility
|
||||
from io import open
|
||||
argparse.open = open
|
||||
|
||||
|
||||
def create_parser():
    """Build and return the command-line parser for this wrapper script."""
    ap = argparse.ArgumentParser(
        description=(
            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
Assumes that text is tokenized and has one sentence per line."""))

    # Required location of the CoreNLP installation (jars are resolved
    # relative to this directory).
    ap.add_argument('--stanford', type=str, metavar='PATH', required=True,
                    help='path to Stanford CoreNLP')

    # Which java binary to launch; defaults to whatever is on $PATH.
    ap.add_argument('--java', type=str, default='java', metavar='PATH',
                    help='path to java executable')

    ap.add_argument('--input', '-i', type=argparse.FileType('r'),
                    default=sys.stdin, metavar='PATH',
                    help="Input text (default: standard input).")
    ap.add_argument('--output', '-o', type=argparse.FileType('w'),
                    default=sys.stdout, metavar='PATH',
                    help="Output text (default: standard output).")

    return ap
|
||||
|
||||
def process_stanford(infile, javacmd, stanfordpath):
    """Pipe `infile` through Stanford CoreNLP and return its stdout stream.

    Launches CoreNLP 3.5.0 as a subprocess reading from `infile` on stdin
    and writing its plain-text report to stdout, which the caller parses
    (see get_sentences). CoreNLP's own stderr chatter is discarded.

    :param infile: open file object fed to CoreNLP's stdin
    :param javacmd: path to the java executable
    :param stanfordpath: directory containing the CoreNLP jars
    :return: the subprocess's stdout pipe (bytes stream)
    """
    classpath = ':'.join([
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar'),
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar')])
    stanford = Popen([javacmd,
                      '-cp', classpath,
                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
                      # one sentence per input line; tokens already whitespace-split
                      '-ssplit.eolonly', 'true',
                      '-tokenize.whitespace', 'true',
                      '-numThreads', '8',
                      '-textFile', '-',
                      # bug fix: was 'outFile' (no leading '-'), so it was not
                      # recognized as an option flag by CoreNLP
                      '-outFile', '-'],
                     stdin=infile, stdout=PIPE,
                     # os.devnull instead of hard-coded '/dev/null' for portability
                     stderr=open(os.devnull, 'w'))
    return stanford.stdout
|
||||
|
||||
|
||||
def get_sentences(instream):
    """Parse Stanford CoreNLP plain-text output into sentences.

    Yields one list per sentence; each element is a dict with keys
    'word', 'pos', 'lemma', 'head' (1-based token index, 0 = root) and
    'label' (dependency relation).
    """
    # Small state machine over the CoreNLP report:
    #   0 = waiting for a "Sentence #" header
    #   1 = next line holds the raw tokens
    #   2 = next line holds the bracketed token annotations
    #   3 = reading dependency lines until a blank line
    state = 0
    toks = []

    for line in instream:
        if state == 0 and line.startswith('Sentence #'):
            # New sentence header: flush the previous sentence, if any.
            if toks:
                yield toks
                toks = []
            state = 1

        elif line == '\n':
            state = 0

        elif state == 3:
            # Dependency line, e.g. "det(dog-2, the-1)":
            # relation label, then head token, then dependent token.
            label, args = line.split('(')
            head_tok, dep_tok = args.split()
            # Token index is the digits after the last '-', minus the
            # trailing punctuation (',' on the head, ')' on the dependent).
            head_idx = int(head_tok.split('-')[-1][:-1])
            dep_idx = int(dep_tok.split('-')[-1][:-1])
            toks[dep_idx - 1]['head'] = head_idx
            toks[dep_idx - 1]['label'] = label

        elif state == 2:
            # Annotation line: "[Text=... PartOfSpeech=... Lemma=...] [...] ..."
            inner = line.split('[', 1)[1].rsplit(']', 1)[0]
            annots = inner.split('] [')
            if len(annots) != len(toks):
                # Annotation count disagrees with the token line: warn and
                # fill in dummy values so the output stays aligned.
                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
                sys.stderr.write(' '.join(w['word'] for w in toks))
                for tok in toks:
                    tok['pos'] = '-'
                    tok['lemma'] = '-'
                    tok['head'] = 0
                    tok['label'] = '-'
                state = 0
                continue
            for tok, annot in zip(toks, annots):
                tok['pos'] = annot.split(' PartOfSpeech=')[-1].split()[0]
                tok['lemma'] = annot.split(' Lemma=')[-1]
            state = 3

        elif state == 1:
            # Raw token line: one dict per whitespace-separated token.
            toks = [{'word': w} for w in line.split()]
            state = 2

    # Flush the final sentence (no trailing header follows it).
    if toks:
        yield toks
|
||||
|
||||
def write(sentence, outstream):
    """Emit one sentence to `outstream` in 8-column CoNLL dependency format,
    one token per line (POS is duplicated into both tag columns)."""
    row = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'
    position = 1
    for token in sentence:
        outstream.write(row.format(position, token['word'], token['lemma'],
                                   token['pos'], token['pos'], '-',
                                   token['head'], token['label']))
        position += 1
|
||||
|
||||
if __name__ == '__main__':
    # Python 2 only: wrap the standard streams so text read/written through
    # them is decoded/encoded as UTF-8 (Python 3 streams handle this already).
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    options = parser.parse_args()

    # Stream the input through CoreNLP, then convert its plain-text report
    # (a bytes pipe, hence the UTF-8 reader wrapper) into CoNLL rows,
    # one blank-line-separated block per sentence.
    stanford = process_stanford(options.input, options.java, options.stanford)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        options.output.write('\n')
|
Loading…
Reference in New Issue
Block a user