mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 12:52:29 +03:00
wrapper for stanford dependency parser
This commit is contained in:
parent
30a03237fa
commit
a1678187fe
129
scripts/training/wrappers/parse-en-stanford.py
Executable file
129
scripts/training/wrappers/parse-en-stanford.py
Executable file
@ -0,0 +1,129 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Rico Sennrich
|
||||
|
||||
# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
|
||||
# assumes tokenized and sentence-split text.
|
||||
|
||||
# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py.
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
import argparse
|
||||
|
||||
from collections import defaultdict
|
||||
from subprocess import Popen, PIPE
|
||||
|
||||
# hack for python2/3 compatibility
|
||||
from io import open
|
||||
argparse.open = open
|
||||
|
||||
|
||||
def create_parser():
    """Build and return the command-line parser for this wrapper script."""
    ap = argparse.ArgumentParser(
        description=(
            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
Assumes that text is tokenized and has one sentence per line."""))

    # Required location of the CoreNLP installation (jars are resolved
    # relative to this directory).
    ap.add_argument('--stanford', type=str, metavar='PATH', required=True,
                    help='path to Stanford CoreNLP')

    # Which java binary to launch; defaults to whatever is on $PATH.
    ap.add_argument('--java', type=str, default='java', metavar='PATH',
                    help='path to java executable')

    ap.add_argument('--input', '-i', type=argparse.FileType('r'),
                    default=sys.stdin, metavar='PATH',
                    help="Input text (default: standard input).")
    ap.add_argument('--output', '-o', type=argparse.FileType('w'),
                    default=sys.stdout, metavar='PATH',
                    help="Output text (default: standard output).")

    return ap
|
||||
|
||||
def process_stanford(infile, javacmd, stanfordpath):
    """Pipe `infile` through Stanford CoreNLP and return its stdout stream.

    Launches CoreNLP 3.5.0 as a subprocess reading from `infile` on stdin
    and writing its plain-text report to stdout, which the caller parses
    (see get_sentences). CoreNLP's own stderr chatter is discarded.

    :param infile: open file object fed to CoreNLP's stdin
    :param javacmd: path to the java executable
    :param stanfordpath: directory containing the CoreNLP jars
    :return: the subprocess's stdout pipe (bytes stream)
    """
    classpath = ':'.join([
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar'),
        os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar')])
    stanford = Popen([javacmd,
                      '-cp', classpath,
                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
                      # one sentence per input line; tokens already whitespace-split
                      '-ssplit.eolonly', 'true',
                      '-tokenize.whitespace', 'true',
                      '-numThreads', '8',
                      '-textFile', '-',
                      # bug fix: was 'outFile' (no leading '-'), so it was not
                      # recognized as an option flag by CoreNLP
                      '-outFile', '-'],
                     stdin=infile, stdout=PIPE,
                     # os.devnull instead of hard-coded '/dev/null' for portability
                     stderr=open(os.devnull, 'w'))
    return stanford.stdout
|
||||
|
||||
|
||||
def get_sentences(instream):
    """Parse Stanford CoreNLP plain-text output into sentences.

    Yields one list per sentence; each element is a dict with keys
    'word', 'pos', 'lemma', 'head' (1-based token index, 0 = root) and
    'label' (dependency relation).
    """
    # Small state machine over the CoreNLP report:
    #   0 = waiting for a "Sentence #" header
    #   1 = next line holds the raw tokens
    #   2 = next line holds the bracketed token annotations
    #   3 = reading dependency lines until a blank line
    state = 0
    toks = []

    for line in instream:
        if state == 0 and line.startswith('Sentence #'):
            # New sentence header: flush the previous sentence, if any.
            if toks:
                yield toks
                toks = []
            state = 1

        elif line == '\n':
            state = 0

        elif state == 3:
            # Dependency line, e.g. "det(dog-2, the-1)":
            # relation label, then head token, then dependent token.
            label, args = line.split('(')
            head_tok, dep_tok = args.split()
            # Token index is the digits after the last '-', minus the
            # trailing punctuation (',' on the head, ')' on the dependent).
            head_idx = int(head_tok.split('-')[-1][:-1])
            dep_idx = int(dep_tok.split('-')[-1][:-1])
            toks[dep_idx - 1]['head'] = head_idx
            toks[dep_idx - 1]['label'] = label

        elif state == 2:
            # Annotation line: "[Text=... PartOfSpeech=... Lemma=...] [...] ..."
            inner = line.split('[', 1)[1].rsplit(']', 1)[0]
            annots = inner.split('] [')
            if len(annots) != len(toks):
                # Annotation count disagrees with the token line: warn and
                # fill in dummy values so the output stays aligned.
                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
                sys.stderr.write(' '.join(w['word'] for w in toks))
                for tok in toks:
                    tok['pos'] = '-'
                    tok['lemma'] = '-'
                    tok['head'] = 0
                    tok['label'] = '-'
                state = 0
                continue
            for tok, annot in zip(toks, annots):
                tok['pos'] = annot.split(' PartOfSpeech=')[-1].split()[0]
                tok['lemma'] = annot.split(' Lemma=')[-1]
            state = 3

        elif state == 1:
            # Raw token line: one dict per whitespace-separated token.
            toks = [{'word': w} for w in line.split()]
            state = 2

    # Flush the final sentence (no trailing header follows it).
    if toks:
        yield toks
|
||||
|
||||
def write(sentence, outstream):
    """Emit one sentence to `outstream` in 8-column CoNLL dependency format,
    one token per line (POS is duplicated into both tag columns)."""
    row = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'
    position = 1
    for token in sentence:
        outstream.write(row.format(position, token['word'], token['lemma'],
                                   token['pos'], token['pos'], '-',
                                   token['head'], token['label']))
        position += 1
|
||||
|
||||
if __name__ == '__main__':
    # Python 2 only: wrap the standard streams so text read/written through
    # them is decoded/encoded as UTF-8 (Python 3 streams handle this already).
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    parser = create_parser()
    options = parser.parse_args()

    # Stream the input through CoreNLP, then convert its plain-text report
    # (a bytes pipe, hence the UTF-8 reader wrapper) into CoNLL rows,
    # one blank-line-separated block per sentence.
    stanford = process_stanford(options.input, options.java, options.stanford)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence, options.output)
        options.output.write('\n')
|
Loading…
Reference in New Issue
Block a user