mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
233 lines
7.0 KiB
Python
Executable File
233 lines
7.0 KiB
Python
Executable File
#!/usr/bin/env python
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Usage: extract-target-trees.py [FILE]

Reads moses-chart's -T output from FILE or standard input and writes trees to
standard output in Moses' XML tree format.
"""
|
|
|
|
import re
|
|
import sys
|
|
|
|
|
|
class Tree:
    """A labelled tree node holding an ordered list of child nodes."""

    def __init__(self, label, children):
        """Store the node's label and its list of children."""
        self.children = children
        self.label = label

    def is_leaf(self):
        """Return True when this node has no children."""
        return not len(self.children)
|
|
|
|
|
|
class Derivation(list):
    """A list of hypotheses that together describe one derivation.

    Each element is expected to expose the attributes used below: ``span``,
    ``source_symbol_info``, ``target_lhs``, ``target_rhs`` and
    ``nt_alignments``.
    """

    def find_root(self):
        """Return the hypothesis starting at position 0 with the largest end.

        When several hypotheses share the largest end position, the first
        one encountered is returned.

        Raises AssertionError if the derivation is empty or contains no
        hypothesis whose span starts at 0.
        """
        assert len(self) > 0
        root = None
        for hypothesis in self:
            if hypothesis.span[0] != 0:
                continue
            if root is None or hypothesis.span[1] > root.span[1]:
                root = hypothesis
        assert root
        return root

    def construct_target_tree(self):
        """Build and return the target-side Tree for this derivation."""
        # Index hypotheses by span so sub-derivations can be looked up while
        # recursing.  A later hypothesis with the same span replaces an
        # earlier one.
        hypo_map = {}
        for hypothesis in self:
            hypo_map[hypothesis.span] = hypothesis
        root = self.find_root()
        return self._build_tree(root, hypo_map)

    @staticmethod
    def _escape_label(label):
        """Return label with '&', '<' and '>' replaced by XML entities.

        Terminal symbols are written verbatim between <tree> elements, so
        these characters must be escaped to keep the output well-formed.
        """
        # Imported locally so the class does not depend on a new file-level
        # import.
        from xml.sax.saxutils import escape
        return escape(label)

    def _build_tree(self, root, hypo_map):
        """Recursively build the Tree rooted at the hypothesis ``root``.

        ``hypo_map`` maps a span to the hypothesis covering it.  Target-side
        symbols that are aligned to a non-terminal become subtrees; every
        other target-side symbol becomes a leaf with an escaped label.

        Raises AssertionError if the number of non-terminal children built
        differs from the number of entries in ``root.nt_alignments``.
        """
        # Build list of NT spans in source order...
        non_term_spans = []
        for item in root.source_symbol_info:
            span = item[0]
            # In hypo_map iff symbol is NT:
            if span != root.span and span in hypo_map:
                non_term_spans.append(span)
        non_term_spans.sort()

        # ... then convert to target order.  After sorting, the i-th
        # alignment pair corresponds to the i-th non-terminal span, and
        # pair[1] is used as the index into root.target_rhs.
        alignment_pairs = sorted(root.nt_alignments)
        target_order_non_term_spans = {}
        for i, pair in enumerate(alignment_pairs):
            target_order_non_term_spans[pair[1]] = non_term_spans[i]

        children = []
        num_non_terms = 0

        for i, symbol in enumerate(root.target_rhs):
            if i in target_order_non_term_spans:
                hyp = hypo_map[target_order_non_term_spans[i]]
                children.append(self._build_tree(hyp, hypo_map))
                num_non_terms += 1
            else:
                children.append(Tree(self._escape_label(symbol), []))

        assert num_non_terms == len(root.nt_alignments)

        return Tree(root.target_lhs, children)
|
|
|
|
|
|
class Hypothesis:
    """Plain record for one parsed line of decoder output.

    All fields start as None and are filled in by the line parsers.
    """

    # Names of the fields every instance carries.
    _FIELDS = (
        "sentence_num",
        "span",
        "source_symbol_info",
        "target_lhs",
        "target_rhs",
        "nt_alignments",
    )

    def __init__(self):
        """Create a hypothesis with every field unset."""
        for field_name in self._FIELDS:
            setattr(self, field_name, None)
|
|
|
|
|
|
def read_derivations(input):
    """Yield (derivation, start_line_num) pairs read from ``input``.

    ``input`` is iterated line by line.  Consecutive lines that parse to the
    same sentence number are grouped into one Derivation; start_line_num is
    the 1-based number of the first line of that group.
    """
    current = Derivation()
    first_line = None
    last_sentence = None
    for line_num, line in enumerate(input, 1):
        hypothesis = parse_line(line)
        if hypothesis.sentence_num != last_sentence:
            # A new sentence number means the previous derivation is
            # complete: emit it (if any) and start collecting a new one.
            last_sentence = hypothesis.sentence_num
            if current:
                yield current, first_line
                current = Derivation()
            first_line = line_num
        current.append(hypothesis)
    # Emit whatever was collected for the final sentence.
    if current:
        yield current, first_line
|
|
|
|
|
|
def parse_line(s):
    """Parse one line of decoder output and return a Hypothesis.

    Lines beginning with "Trans Opt" are handled by the old-format parser;
    every other line is handled by the new-format parser.
    """
    parser = (parse_line_old_format if s.startswith("Trans Opt")
              else parse_line_new_format)
    return parser(s)
|
|
|
|
|
|
# Extract the hypothesis components and return a Hypothesis object.
|
|
def parse_line_old_format(s):
    """Parse a "Trans Opt ..." line and return a Hypothesis.

    The line supplies, in order: the sentence index, the hypothesis span,
    the source symbols each tagged with their own span, the target
    left-hand side, the target right-hand side symbols and the
    non-terminal alignment pairs.

    The stored sentence_num is the parsed index plus one.

    If the line does not match, it is echoed to standard error and an
    AssertionError is raised.
    """
    # Literal spaces inside these patterns are significant.
    line_re = re.compile(
        r"Trans Opt (\d+) "
        r"\[(\d+)\.\.(\d+)\]:"
        r"((?: \[\d+\.\.\d+\]=\S+ )+):"
        r" (\S+) ->\S+ -> "
        r"((?:\S+ )+):"
        r"((?:\d+-\d+ )*): c=")
    # Compiled once, outside the loops below.  Raw strings avoid the
    # invalid-escape warning that a plain "\[" or "\d" literal triggers.
    symbol_re = re.compile(r"\[(\d+)\.\.(\d+)\]=(\S+)")
    alignment_re = re.compile(r'(\d+)-(\d+)')

    match = line_re.match(s)
    if not match:
        sys.stderr.write("%s\n" % s)
    assert match
    group = match.groups()

    hypothesis = Hypothesis()
    hypothesis.sentence_num = int(group[0]) + 1
    hypothesis.span = (int(group[1]), int(group[2]))

    # Each item looks like "[start..end]=symbol".
    hypothesis.source_symbol_info = []
    for item in group[3].split():
        item_match = symbol_re.match(item)
        assert item_match
        start, end, symbol = item_match.groups()
        span = (int(start), int(end))
        hypothesis.source_symbol_info.append((span, symbol))

    hypothesis.target_lhs = group[4]
    hypothesis.target_rhs = group[5].split()

    # Each pair looks like "i-j" and is stored as a tuple of two ints.
    hypothesis.nt_alignments = []
    for pair in group[6].split():
        pair_match = alignment_re.match(pair)
        assert pair_match
        ai = (int(pair_match.group(1)), int(pair_match.group(2)))
        hypothesis.nt_alignments.append(ai)
    return hypothesis
|
|
|
|
|
|
# Extract the hypothesis components and return a Hypothesis object.
|
|
def parse_line_new_format(s):
    """Parse a "|||"-separated line and return a Hypothesis.

    The fields are: sentence index, source rule, target rule, non-terminal
    alignment pairs, and one "start..end" span per source symbol.  The
    hypothesis span runs from the start of the first listed span to the end
    of the last one.  The stored sentence_num is the parsed index plus one.

    If the line does not match, it is echoed to standard error and an
    AssertionError is raised.
    """
    line_re = re.compile(
        r"(\d+) \|\|\|"
        r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|"
        r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|"
        r" ((?:\d+-\d+ )*)\|\|\|"
        r"((?: \d+\.\.\d+)*)")
    found = line_re.match(s)
    if not found:
        sys.stderr.write("%s\n" % s)
    assert found
    (sentence_index, _source_lhs, source_rhs,
     target_lhs, target_rhs, alignment_text, span_text) = found.groups()

    def int_pair(pair_pattern, token):
        # Turn a token such as "3..5" or "0-1" into a tuple of two ints.
        pair_match = re.match(pair_pattern, token)
        assert pair_match
        return (int(pair_match.group(1)), int(pair_match.group(2)))

    hypothesis = Hypothesis()
    hypothesis.sentence_num = int(sentence_index) + 1

    spans = [int_pair(r'(\d+)\.\.(\d+)', token)
             for token in span_text.split()]
    hypothesis.span = (spans[0][0], spans[-1][1])

    # Pair each source symbol with the span listed at the same position.
    hypothesis.source_symbol_info = [
        (spans[index], strip_brackets(symbol))
        for index, symbol in enumerate(source_rhs.split())
    ]
    hypothesis.target_lhs = strip_brackets(target_lhs)
    hypothesis.target_rhs = target_rhs.split()
    hypothesis.nt_alignments = [int_pair(r'(\d+)-(\d+)', token)
                                for token in alignment_text.split()]
    return hypothesis
|
|
|
|
|
|
def strip_brackets(symbol):
    """Return symbol without an enclosing "[...]" pair, if it has one.

    A symbol that does not both start with "[" and end with "]" is returned
    unchanged; this includes the empty string.
    """
    # startswith/endswith are safe on an empty string, whereas indexing
    # symbol[0] would raise IndexError.
    if symbol.startswith('[') and symbol.endswith(']'):
        return symbol[1:-1]
    return symbol
|
|
|
|
|
|
def tree_to_xml(tree):
    """Return the string form of tree in the <tree label="..."> format.

    A leaf is rendered as its bare label.  Any other node is rendered as an
    opening tag carrying its label, then each rendered child followed by a
    single space, then the closing tag.
    """
    if tree.is_leaf():
        return tree.label
    opening = '<tree label="%s"> ' % tree.label
    rendered_children = "".join(
        tree_to_xml(child) + " " for child in tree.children)
    return opening + rendered_children + '</tree>'
|
|
|
|
|
|
def main():
    """Read derivations from FILE or standard input and print one tree each.

    With no argument, or with "-", input is read from standard input;
    otherwise the single argument names the input file.  More than one
    argument prints a usage message to standard error and exits with
    status 1.

    If building a tree fails, the starting line number of the offending
    derivation is reported on standard error and the exception is re-raised.
    """
    if len(sys.argv) > 2:
        sys.stderr.write("usage: %s [FILE]\n" % sys.argv[0])
        sys.exit(1)

    read_stdin = len(sys.argv) == 1 or sys.argv[1] == "-"
    stream = sys.stdin if read_stdin else open(sys.argv[1])
    try:
        for derivation, line_num in read_derivations(stream):
            try:
                tree = derivation.construct_target_tree()
            except Exception:
                msg = (
                    "error processing derivation starting at line %d\n"
                    % line_num)
                sys.stderr.write(msg)
                raise
            # Written via sys.stdout so the script runs unchanged on both
            # Python 2 and Python 3 (the print statement is Python 2 only).
            sys.stdout.write(tree_to_xml(tree) + "\n")
    finally:
        # Close only a file opened here; standard input is left alone.
        if not read_stdin:
            stream.close()
|
|
|
|
|
|
# Run the converter only when executed as a script, not when imported.
if '__main__' == __name__:
    main()
|