mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
makemteval and small change to tokenizer. /Tom Hoar and Tomas Fulajtar
This commit is contained in:
parent
7aa4d5d8d5
commit
c0be182bfa
12
contrib/makemteval/makemteval.ini
Normal file
12
contrib/makemteval/makemteval.ini
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
[set]
|
||||||
|
filein=
|
||||||
|
fileout=
|
||||||
|
settype=
|
||||||
|
srclang=
|
||||||
|
tstlang=
|
||||||
|
setid=SetID
|
||||||
|
refid=RefID
|
||||||
|
sysid=SysID
|
||||||
|
docid=DocID
|
||||||
|
genre=Genre
|
||||||
|
|
253
contrib/makemteval/makemteval.py
Normal file
253
contrib/makemteval/makemteval.py
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
#! /usr/bin/env python
|
||||||
|
# -*- coding: utf8 -*-
|
||||||
|
|
||||||
|
#===============================================================================
|
||||||
|
# Author: Walapa Muangjeen
|
||||||
|
#===============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# Version string of this script.
__version__ = '2.0'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import codecs
|
||||||
|
import ConfigParser
|
||||||
|
from optparse import OptionParser
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
|
||||||
|
class makemteval:
    """Wrap a plain-text file (one segment per line) in mteval XML.

    The settings live in a plain dict (``self.config``) whose keys are
    ``filein``, ``fileout``, ``settype``, ``srclang``, ``tstlang``,
    ``setid``, ``refid``, ``sysid``, ``docid`` and ``genre``.
    """

    def __init__(self, config=None):
        """Store a private copy of *config*, or the built-in defaults.

        config -- optional dict of settings; anything that is not a dict
                  is ignored and the defaults are used instead.
        """
        if isinstance(config, dict):
            self.config = deepcopy(config)
        else:
            self.config = {
                'filein': None,
                'fileout': None,
                'settype': None,
                'srclang': None,
                'tstlang': None,
                'setid': 'SetID',
                'refid': 'RefID',
                'sysid': 'SysID',
                'docid': 'DocID',
                'genre': 'Genre',
            }

    def parseini(self, config=None, inifile=None, section='set'):
        """Overlay the values found in an INI file onto *config*.

        config  -- dict to update in place (defaults to ``self.config``).
        inifile -- path of the INI file; defaults to a file named after the
                   running script (``sys.argv[0]``) with an ``.ini``
                   extension, located next to that script.
        section -- name of the INI section to read.

        Returns a deep copy of the updated dict.
        """
        if inifile is None:
            script = sys.argv[0]
            inifile = os.path.join(
                os.path.abspath(os.path.dirname(script)),
                os.path.splitext(os.path.basename(script))[0] + '.ini')

        if config is None:
            config = self.config

        cfgparser = ConfigParser.RawConfigParser()
        if not cfgparser.has_section(section):
            cfgparser.add_section(section)

        # Seed the parser with the current values so that options absent
        # from the INI file keep their existing setting.
        for option in config:
            cfgparser.set(section, option, config[option])

        cfgparser.read(inifile)

        for option in cfgparser.options(section):
            config[option] = cfgparser.get(section, option)

        return deepcopy(config)

    @staticmethod
    def _unescape(text):
        """Turn the five predefined XML entities in *text* into characters."""
        # '&amp;' must be handled last so that '&amp;lt;' becomes '&lt;'
        # rather than '<'.
        for entity, char in (('&quot;', '"'), ('&apos;', "'"),
                             ('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
            text = text.replace(entity, char)
        return text

    @staticmethod
    def _escape(text):
        """Replace XML special characters in *text* with their entities."""
        # '&' must be handled first so the ampersands introduced by the
        # other entities are not escaped again.
        for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                             ("'", '&apos;'), ('"', '&quot;')):
            text = text.replace(char, entity)
        return text

    @staticmethod
    def _setheader(config):
        """Return the opening set tag for config['settype'], or None.

        None is returned when the set type is not one of ``srcset``,
        ``refset`` or ``tstset``.
        """
        settype = config['settype']
        if settype == "srcset":
            return '<%(settype)s setid="%(setid)s" srclang="%(srclang)s">' % config
        if settype == "refset":
            return ('<%(settype)s setid="%(setid)s" srclang="%(srclang)s" '
                    'trglang="%(tstlang)s" refid="%(refid)s">' % config)
        if settype == "tstset":
            header = ('<%(settype)s setid="%(setid)s" srclang="%(srclang)s" '
                      'trglang="%(tstlang)s" sysid="%(sysid)s"' % config)
            # 'sysbleu' and 'language' are not part of the default
            # configuration; emit them only when the caller supplied them
            # instead of failing with a KeyError.
            for key in ('sysbleu', 'language'):
                if key in config:
                    header += ' %s="%s"' % (key, config[key])
            return header + '>'
        return None

    def writesgm(self, config):
        """Read config['filein'] and write it as mteval XML to config['fileout'].

        Each input line becomes one ``<seg>`` element, numbered from 1.
        Progress messages go to stderr only when the module runs as a script.

        Returns True when a file cannot be opened or the set type is
        invalid (a message is written to stderr), otherwise None.
        """
        verbose = __name__ == "__main__"

        try:
            filein = codecs.open(os.path.abspath(os.path.expanduser(config['filein'])), "r", 'utf-8-sig')
        except IOError as ErrorMessage:
            sys.stderr.write("\n: %s\n" % (ErrorMessage))
            sys.stderr.write(": End Program\n")
            return True

        if verbose:
            sys.stderr.write(": opened \"%s\" for reading\n" % (os.path.basename(config['filein'])))

        try:
            rawlines = filein.read().splitlines()
        finally:
            filein.close()

        # Unescape first so that input which is already entity-encoded is
        # not escaped a second time.
        lines = [self._escape(self._unescape(l)) for l in rawlines]

        if verbose:
            sys.stderr.write(": closed \"%s\"\n" % (os.path.basename(config['filein'])))

        outpath = os.path.abspath(os.path.expanduser(config['fileout']))
        try:
            fileout = codecs.open(outpath, "w", 'utf8')
        except IOError as ErrorMessage:
            sys.stderr.write("\n: %s\n" % (ErrorMessage))
            sys.stderr.write(": End Program\n")
            return True

        if verbose:
            sys.stderr.write(": opened \"%s\" for writing\n" % (os.path.basename(config['fileout'])))

        try:
            header = self._setheader(config)
            if header is None:
                # Do not leave an empty output file behind.
                fileout.close()
                os.unlink(outpath)
                sys.stderr.write("\n: Invalid \"settype\" value %s\n" % (config['settype']))
                sys.stderr.write(": End Program\n")
                return True

            contents = [
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<!DOCTYPE mteval SYSTEM "ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-xml-v1.3.dtd">',
                '<mteval>',
                header,
            ]

            # Source sets carry no system id on the DOC element.
            if config['settype'] == "srcset":
                sysattr = ''
            else:
                sysattr = 'sysid="%s" ' % (config['sysid'])
            contents.append('<DOC %sdocid="%s" genre="%s">' % (sysattr, config['docid'], config['genre']))

            if verbose:
                sys.stderr.write(": added header\n")

            for segid, line in enumerate(lines, 1):
                contents.append('<seg id="%d"> %s </seg>' % (segid, line))

            if verbose:
                sys.stderr.write(": added %d lines\n" % (len(lines)))

            contents.append('</DOC>')
            contents.append('</%s>' % (config['settype']))
            contents.append('</mteval>')

            if verbose:
                sys.stderr.write(": added footer\n")

            fileout.write('%s\n' % ('\n'.join(contents)))
        finally:
            # Closing an already closed file is harmless.
            fileout.close()

        if verbose:
            sys.stderr.write(": closed \"" + os.path.basename(config['fileout']) + "\"\n")

        return None
|
||||||
|
|
||||||
|
|
||||||
|
def parsecmd(config=None):
    """Parse the command line (``sys.argv``) into *config*.

    config -- dict supplying the option defaults; it is updated in place
              with the parsed values.  When omitted, a new dict is used.

    The options --filein, --fileout, --settype, --srclang and --tstlang
    must end up with a non-empty value (from *config* or the command
    line); otherwise the missing ones are reported on stderr, the usage
    text is printed and the process exits with status 1.

    Returns the updated dict.
    """
    if config is None:
        config = {}

    # (short flag, option name, help text) for the mandatory options.
    required = (
        ("-i", "filein", "UNC path to tokenized input file (required)"),
        ("-o", "fileout", "UNC path of fileout file (required)"),
        ("-s", "srclang", "2-letter code for source language (required)"),
        ("-t", "tstlang", "2-letter code for test language (required)"),
        ("-T", "settype", "Use XML tag: srcset, tstset or refset (required)"),
    )
    # (short flag, option name, fallback default, help text prefix).
    optional = (
        ("-e", "setid", "SetID", "Test set ID"),
        ("-d", "docid", "DocID", "Document ID"),
        ("-r", "refid", "RefID", "Reference ID"),
        ("-S", "sysid", "SysID", "System ID used to make the test set"),
        ("-g", "genre", "Genre", "Genre of the test set and system ID"),
    )

    optparser = OptionParser()

    for flag, name, helptext in required:
        optparser.add_option(
            flag, "--" + name, dest=name, default=config.get(name),
            help=helptext)

    for flag, name, fallback, helptext in optional:
        default = config.get(name, fallback)
        # %-formatting (rather than concatenation) keeps a None default
        # from raising a TypeError while building the help text.
        optparser.add_option(
            flag, "--" + name, dest=name, default=default,
            help='%s (default "%s")' % (helptext, default))

    options, commands = optparser.parse_args()

    # Checked in a fixed order so the messages are reported consistently.
    missing = [
        "Error: missing --%s" % (name)
        for name in ("filein", "fileout", "settype", "srclang", "tstlang")
        if not getattr(options, name)
    ]

    if missing:
        for msg in missing:
            sys.stderr.write('%s\n' % (msg))
        optparser.print_help()
        sys.exit(1)

    for name in ("filein", "fileout", "settype", "setid", "srclang",
                 "tstlang", "refid", "sysid", "docid", "genre"):
        config[name] = getattr(options, name)

    sys.stderr.write(": Configuration complete\n")

    return config
|
||||||
|
|
||||||
|
|
||||||
|
licensetxt=u'''CorpusFiltergraph™
|
||||||
|
Copyright © 2010-2014 Precision Translation Tools Co., Ltd.
|
||||||
|
|
||||||
|
This module is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public License
|
||||||
|
along with this program. If not, see http://www.gnu.org/licenses/.
|
||||||
|
|
||||||
|
For more information, please contact Precision Translation Tools Pte
|
||||||
|
at: http://www.precisiontranslationtools.com'''
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the converter as a command-line program.

    Settings are taken from the built-in defaults, then the INI file,
    then the command line; the result is passed to writesgm().

    Returns the process exit status: 1 when writesgm() reports an error
    (it returns a true value in that case), otherwise 0.
    """
    mksgm = makemteval()

    mksgm.parseini(mksgm.config)

    parsecmd(mksgm.config)

    # Propagate a conversion failure to the shell instead of always
    # exiting successfully.
    if mksgm.writesgm(mksgm.config):
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
@ -1,5 +1,5 @@
|
|||||||
<\/?\S+\/?>
|
<\/?\S+\/?>
|
||||||
<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
|
<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
|
||||||
<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
|
<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
|
||||||
(\w\-\_\.)+\@((\w\-\_)+\.)+[a-zA-Z]{2,}
|
[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}
|
||||||
(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
|
(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
|
||||||
|
Loading…
Reference in New Issue
Block a user