mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-07 12:10:36 +03:00
61162dd242
Most of the complaints fixed here were from Pocketlint, but many were also from Syntastic, the Vim plugin.
56 lines
1.3 KiB
Python
Executable File
56 lines
1.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# Author: Rico Sennrich
|
|
|
|
# convert trees in moses XML format to PTB-style bracketed format
|
|
|
|
from __future__ import print_function, unicode_literals
|
|
import sys
|
|
import codecs
|
|
|
|
from lxml import etree as ET
|
|
|
|
|
|
def escape(word):
    """Return *word* with Moses-special characters replaced by entities.

    The characters handled are the factor separator ``|``, the syntax
    non-terminal brackets ``[`` and ``]``, and both quote characters.
    Each one is rewritten to its XML/HTML entity so that it cannot be
    mistaken for markup in the bracketed output.

    :param word: the token text to escape.
    :return: the escaped token text.
    """
    # None of the entities contains a character that a later pair would
    # replace again, so applying the pairs one after another is safe.
    replacements = (
        ('|', '&#124;'),    # factor separator
        ('[', '&#91;'),     # syntax non-terminal (opening bracket)
        (']', '&#93;'),     # syntax non-terminal (closing bracket)
        ('\'', '&apos;'),
        ('"', '&quot;'),
    )
    for char, entity in replacements:
        word = word.replace(char, entity)

    return word
|
|
|
|
|
|
def make_brackets(xml):
    """Render an XML tree node as a bracketed string.

    Every node becomes `` [LABEL ...]``, where LABEL is the node's
    ``label`` attribute. A node with non-blank text is rendered as a leaf
    holding the escaped, stripped text; its children are not visited.
    Any other node is rendered as its children's bracketings
    concatenated in document order.

    :param xml: an element exposing ``get``, ``text`` and child iteration.
    :return: the bracketed form, beginning with a single space.
    """
    opening = ' [' + xml.get('label')

    text = xml.text
    if text and text.strip():
        # Leaf node: emit the token itself.
        return opening + ' ' + escape(text.strip()) + ']'

    # Inner node: recurse into each child and glue the pieces together.
    return opening + ''.join(make_brackets(node) for node in xml) + ']'
|
|
|
|
|
|
if __name__ == '__main__':

    # On Python 2 the standard streams are wrapped so that the script
    # reads and writes UTF-8 text; nothing is wrapped on Python 3.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # One XML tree per input line, one bracketed tree per output line.
    for line in sys.stdin:
        # Blank lines are echoed unchanged. Whitespace-only lines (such as
        # '\r\n') count as blank too: they hold no XML document, so passing
        # them to the parser would raise instead of producing output.
        if not line.strip():
            sys.stdout.write(line)
            continue
        out = make_brackets(ET.fromstring(line)).strip()
        sys.stdout.write(out + '\n')
|