mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-07 12:10:36 +03:00
69c6164b82
normalisation of test data git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1673 1f5c12ca-751b-0410-a591-d2e778427230
73 lines
2.5 KiB
Python
Executable File
73 lines
2.5 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
#
|
|
# Normalise the references or nbest list, prior to statistic and feature extraction
|
|
#
|
|
|
|
import optparse,sys, math, re, xml.sax.saxutils
|
|
|
|
# When False, normalize() lower-cases the text before tokenizing it;
# when True the case of the input is left untouched.
preserve_case = True

# First rewrite pass used by normalize(): (compiled pattern, replacement)
# pairs, applied in the order listed.
normalize1 = [
    (re.compile(regex), substitute)
    for regex, substitute in (
        ('<skipped>', ''),  # strip "skipped" tags
        (r'-\n', ''),       # strip end-of-line hyphenation and join lines
        (r'\n', ' '),       # join lines
        # disabled rule, kept for reference:
        # (r'(\d)\s+(?=\d)', r'\1'),  # join digits
    )
]

# Second rewrite pass used by normalize(): same pair layout as above.
normalize2 = [
    (re.compile(regex), substitute)
    for regex, substitute in (
        # tokenize punctuation. apostrophe is missing
        (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 '),
        # tokenize period and comma unless preceded by a digit
        (r'([^0-9])([\.,])', r'\1 \2 '),
        # tokenize period and comma unless followed by a digit
        (r'([\.,])([^0-9])', r' \1 \2'),
        # tokenize dash when preceded by a digit
        (r'([0-9])(-)', r'\1 \2 '),
    )
]
|
|
|
|
def normalize(s):
    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.

    s is either a string or an iterable of strings; anything that is not a
    string is joined with single spaces before processing.

    Returns the list of whitespace-separated tokens that remain after both
    rewrite passes (normalize1, then normalize2) have been applied.
    '''
    # NOTE(review): this spot used to carry the comment "Added to bypass
    # NIST-style pre-processing of hyp and ref files -- wade", but no bypass
    # exists in this function; only the sequence-joining step below remains.
    if not isinstance(s, str):
        s = " ".join(s)

    # language-independent part:
    for (pattern, replace) in normalize1:
        s = pattern.sub(replace, s)
    # unescape() always handles &amp;, &lt; and &gt;. The extra mapping must
    # be keyed on the entity text "&quot;" -- keyed on a bare '"' it would
    # map the quote character to itself and leave &quot; in the output.
    s = xml.sax.saxutils.unescape(s, {'&quot;': '"'})

    # language-dependent part (assuming Western languages):
    # pad with spaces so the rules that need a neighbouring character also
    # fire at the very start and end of the text
    s = " %s " % s
    if not preserve_case:
        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = pattern.sub(replace, s)
    return s.split()
|
|
|
|
def process_nbest():
    '''Normalise the second field of every n-best line read from stdin.

    Each input line is split on the separator "||| ". The second field is
    passed through normalize() and replaced by the resulting tokens joined
    with single spaces, followed by one trailing space. The fields are then
    re-joined with the same separator and written to stdout, one line per
    input line. A progress message is written to stderr first.

    Raises IndexError for a line that does not contain the separator.
    '''
    sep = "||| "
    sys.stderr.write("Processing nbest file\n")
    for line in sys.stdin:
        # Strip only the line terminator: blindly slicing off the last
        # character eats real data when the final line has no newline.
        fields = line.rstrip("\n").split(sep)
        fields[1] = " ".join(normalize(fields[1])) + " "
        # stream.write() instead of the Python-2-only "print >>stream" form
        sys.stdout.write(sep.join(fields) + "\n")
|
|
|
|
|
|
def process_refs():
    '''Normalise every line of plain text read from stdin.

    Each input line is passed through normalize() and the resulting tokens
    are written to stdout joined by single spaces, one output line per
    input line. A progress message is written to stderr first.
    '''
    sys.stderr.write("Processing text file\n")
    for line in sys.stdin:
        # Strip only the line terminator: blindly slicing off the last
        # character eats real data when the final line has no newline.
        tokens = normalize(line.rstrip("\n"))
        # stream.write() instead of the Python-2-only "print >>stream" form
        sys.stdout.write(" ".join(tokens) + "\n")
|
|
|
|
def main():
    '''Parse the command line and run the selected stdin-to-stdout filter.

    With -n/--nbest the input is handled by process_nbest(); without it the
    input is handled by process_refs().
    '''
    cli = optparse.OptionParser(usage="usage: %prog [options] < input > output")
    cli.add_option(
        "-n", "--nbest",
        dest="nbest",
        action="store_true",
        default=False,
        help="Process nbest file"
    )
    # positional arguments are accepted by the parser but not used
    opts, _positional = cli.parse_args()

    handler = process_nbest if opts.nbest else process_refs
    handler()
|
|
|
|
# Run the filter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
|
|
|