mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-30 23:42:30 +03:00
251 lines
8.5 KiB
Python
Executable File
251 lines
8.5 KiB
Python
Executable File
#!/usr/bin/env python
|
|
#
|
|
# Picaro: A simple command-line alignment visualization tool.
|
|
#
|
|
# picaro.py
|
|
# Visualize alignments between sentences in a grid format.
|
|
#
|
|
# Jason Riesa <riesa@isi.edu>
|
|
# version: 01-16-2010
|
|
#
|
|
# Copyright (C) 2013 Jason Riesa
|
|
#
|
|
# This library is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License as published by the Free Software Foundation; either
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
#
|
|
# This library is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with this library; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
import sys, os, commands
|
|
from collections import defaultdict
|
|
|
|
# Path of the external transitive-closure binary.  It is only consulted when
# SHOW_TC_A1 or SHOW_TC_A2 is non-zero; defining it here keeps those code
# paths from failing with a NameError.
TC_BIN = "tc/tc.linux32"

# Input paths, filled in from the command line (-a1, -a2, -f, -e).
# An empty string means "not given".
a1_file_str = ""
a2_file_str = ""
f_file_str = ""
e_file_str = ""

# Flags requesting that transitive-closure links be shown for alignment 1 /
# alignment 2.  Both are off by default.
SHOW_TC_A1 = 0
SHOW_TC_A2 = 0

# Sentences whose e or f side has more than this many words are not displayed
# (-maxlen); infinity means "no limit".
maxlen = float('inf')
|
|
|
|
# Process command line options.
#
# Options are consumed destructively from sys.argv; index 0 (the program
# name) is left in place because the usage text prints it.  An unrecognised
# option is reported and the script exits with status 1 straight away.  Any
# other problem (missing value, non-numeric -maxlen, missing mandatory
# option) is reported on stderr followed by the usage text, then the script
# exits with status 1.
#
# NOTE: there is currently no option that turns on SHOW_TC_A1 / SHOW_TC_A2,
# so they keep whatever value they had before this point.
try:
    while len(sys.argv) > 1:
        option = sys.argv.pop(1)
        if option not in ('-a1', '-a2', '-f', '-e', '-maxlen'):
            sys.stderr.write("Invalid option: %s\n" % (option))
            sys.exit(1)

        # Every recognised option takes exactly one value; say so explicitly
        # instead of surfacing a bare "list index out of range".
        if len(sys.argv) < 2:
            raise Exception("Missing argument to option %s" % (option))
        value = sys.argv.pop(1)

        if option == '-a1':
            a1_file_str = value
        elif option == '-a2':
            a2_file_str = value
        elif option == '-f':
            f_file_str = value
        elif option == '-e':
            e_file_str = value
        else:
            # option == '-maxlen'
            try:
                maxlen = int(value)
            except ValueError:
                raise Exception("Invalid argument to option -maxlen: %s" % (value))

    if a1_file_str == "" or f_file_str == "" or e_file_str == "":
        raise Exception("Not all options properly specified.")

    # Make sure transitive closure binary exists if user has enabled this option
    if SHOW_TC_A1 or SHOW_TC_A2:
        if not os.path.exists(TC_BIN):
            raise Exception("Transitive closure binary "+TC_BIN+" not found.")
except Exception as msg:
    sys.stderr.write("%s: %s\n" % (sys.argv[0], msg))
    sys.stderr.write("Usage: %s: -a1 <alignment1> -f <f> -e <e> [-a2 <alignment2>]\n" % (sys.argv[0]))
    sys.stderr.write("Mandatory arguments:\n")
    sys.stderr.write(" -a1 <a1>\t path to alignment 1 file in f-e format\n")
    sys.stderr.write(" -f <f>\t\t path to source text f\n")
    sys.stderr.write(" -e <e>\t\t path to target text e\n")
    sys.stderr.write("Optional arguments:\n")
    sys.stderr.write(" -a2 <a2>\t path to alignment 2 file in f-e format\n")
    sys.stderr.write(" -maxlen <len>\t display alignment only when e and f have length <= len\n")
    sys.exit(1)
|
|
|
|
|
|
# Open the three mandatory inputs for reading, in this order: alignment 1,
# source text f, target text e.
a_file, f_file, e_file = map(open, (a1_file_str, f_file_str, e_file_str))

# The second alignment file exists only when -a2 supplied a path.
if a2_file_str:
    a2_file = open(a2_file_str)
|
|
|
|
# Interactive display loop.
#
# The alignment, e and f files are read in lockstep, one line per sentence.
# For each sentence that is shown, the e words are written vertically as
# column headers, followed by one grid row per f word, and the user is then
# asked which sentence to show next.

# Grid cells when only alignment 1 is shown, keyed by the link value
# (0 = no link, 1 = link listed in the file, 2 = link added by transitive
# closure).
_ONE_WAY_CELLS = {
    0: ':',
    1: '\x1b[44m \x1b[0m',              # blue box
    2: 'O',
}

# Grid cells when two alignments are compared, keyed by
# (value in alignment 1, value in alignment 2).
_TWO_WAY_CELLS = {
    (0, 0): ':',                             # link in neither alignment
    (1, 1): '\x1b[42m\x1b[1m \x1b[0m',       # green: link in both
    (1, 2): '\x1b[42m\x1b[30m2\x1b[0m',      # green: closure-only in a2
    (2, 1): '\x1b[42m \x1b[0m',              # green: closure-only in a1
    (2, 2): '\x1b[42m\x1b[30m3\x1b[0m',      # green: closure-only in both
    (0, 1): '\x1b[1m\x1b[43m \x1b[0m',       # yellow: link only in a2
    (0, 2): '\x1b[43m\x1b[30m2\x1b[0m',      # yellow: closure-only, a2
    (1, 0): '\x1b[1m\x1b[44m \x1b[0m',       # blue: link only in a1
    (2, 0): '\x1b[44m\x1b[37m1\x1b[0m',      # blue: closure-only, a1
}


def _transitive_closure(link_tokens):
    """Return the whitespace-separated tokens printed by TC_BIN.

    The links are written to the binary's standard input as one
    space-separated line.  The binary is started directly, without a shell,
    so the contents of the alignment file are never interpreted as shell
    syntax.  Its stderr is merged into the captured output.
    """
    import subprocess
    proc = subprocess.Popen([TC_BIN],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    output, _ = proc.communicate(' '.join(link_tokens) + '\n')
    return output.split()


def _link_matrix(link_tokens, closure_tokens=()):
    """Build a {(f_index, e_index): value} map from "i-j" link tokens.

    Links in link_tokens get value 1.  Links that appear only in
    closure_tokens get value 2.  Absent pairs mean "no link".

    Raises ValueError if a token is not of the form "<int>-<int>".
    """
    matrix = {}
    for token in link_tokens:
        i, j = map(int, token.split('-'))
        matrix[(i, j)] = 1
    for token in closure_tokens:
        i, j = map(int, token.split('-'))
        # Never downgrade a link that was listed explicitly.
        matrix.setdefault((i, j), 2)
    return matrix


def _header_rows(e_words):
    """Return the e words written vertically, one text row per letter.

    Words are bottom-aligned: shorter words are padded with a blank at the
    top so that every word ends on the last row.
    """
    tallest = max([len(w) for w in e_words] or [0])
    rows = []
    for height in range(tallest, 0, -1):
        rows.append(' '.join([w[-height] if len(w) >= height else ' '
                              for w in e_words]))
    return rows


def _grid_rows(f_words, e_words, matrix1, matrix2=None):
    """Return one text row per f word: a cell per e word, then the f word.

    With matrix2 left as None only alignment 1 is drawn; otherwise each cell
    encodes the combination of both alignments.
    """
    rows = []
    for i, f_word in enumerate(f_words):
        cells = []
        for j in range(len(e_words)):
            val1 = matrix1.get((i, j), 0)
            if matrix2 is None:
                cells.append(_ONE_WAY_CELLS[val1])
            else:
                cells.append(_TWO_WAY_CELLS[(val1, matrix2.get((i, j), 0))])
        cells.append(f_word)
        rows.append(' '.join(cells))
    return rows


sentenceNumber = 0
nextRequested = 1
for aline in a_file:
    # Keep all input files in step, even for sentences that are skipped.
    eline = e_file.readline()
    fline = f_file.readline()
    if a2_file_str != "":
        a2line = a2_file.readline()

    # Update tracking counts
    sentenceNumber += 1
    if sentenceNumber < nextRequested:
        continue

    e_words = eline.split()
    f_words = fline.split()

    # Don't generate alignment grids for very large sentences
    if len(e_words) > maxlen or len(f_words) > maxlen:
        continue

    # Alignment 1, plus its transitive closure when requested.  The closure
    # is computed only for sentences that are actually displayed.
    links = aline.split()
    tc1 = _transitive_closure(links) if SHOW_TC_A1 else []
    a = _link_matrix(links, tc1)

    # Alignment 2 (optional), handled the same way.
    a2 = None
    if a2_file_str != "":
        links2 = a2line.split()
        tc2 = _transitive_closure(links2) if SHOW_TC_A2 else []
        a2 = _link_matrix(links2, tc2)

    out = ["== SENTENCE  %d  ==" % (sentenceNumber)]
    out.extend(_header_rows(e_words))
    out.extend(_grid_rows(f_words, e_words, a, a2))
    if a2 is None:
        # The single-alignment view is followed by one blank line.
        out.append("")
    sys.stdout.write("\n".join(out) + "\n")

    nextDefault = sentenceNumber + 1
    sys.stdout.write("Enter next alignment number or 'q' to quit [%d]: " %(nextDefault))
    # The prompt has no trailing newline, so flush it before blocking on input.
    sys.stdout.flush()
    user_input = sys.stdin.readline().strip()
    if user_input == "":
        nextRequested = nextDefault
    elif user_input.startswith("q"):
        # Any answer beginning with 'q' quits.
        sys.exit(1)
    else:
        try:
            nextRequested = int(user_input)
        except ValueError:
            nextRequested = nextDefault
            sys.stdout.write("Unknown alignment id: %s\nContinuing with %d.\n" %(user_input, nextRequested))
        else:
            # The input files are read forward only, so an alignment that has
            # already gone by cannot be shown again.
            if nextRequested < nextDefault:
                sys.stdout.write("Alignment %d has already been passed.\nContinuing with %d.\n" %(nextRequested, nextDefault))
                nextRequested = nextDefault
|
|
|
|
# Release the input files.
a_file.close()
e_file.close()
f_file.close()
# The second alignment file is only opened when -a2 was given; close it too
# so that it is not leaked.
if a2_file_str != "":
    a2_file.close()
|
|
|