mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-07 12:10:36 +03:00
167 lines
7.0 KiB
Python
Executable File
167 lines
7.0 KiB
Python
Executable File
#! /usr/bin/env python
|
|
# -*- coding: utf_8 -*-
|
|
"""This program is used to prepare TMX files from corpora composed of 2 files for each language pair,
|
|
where the position of a segment in the first language file is exactly the same as in the second
|
|
language file.
|
|
|
|
The program requires that PythonCard and wxPython (as well as Python) be previously installed.
|
|
|
|
Copyright 2009, 2010 João Luís A. C. Rosas
|
|
|
|
Distributed under GNU GPL v3 licence (see http://www.gnu.org/licenses/)
|
|
|
|
E-mail: joao.luis.rosas@gmail.com """
|
|
|
|
# Module metadata: revision number, release date and author of this script.
__version__ = "$Revision: 1.033$"
__date__ = "$Date: 2010/02/25$"
__author__ = "$João Luís A. C. Rosas$"
|
|
|
|
from PythonCard import clipboard, dialog, graphic, model
|
|
from PythonCard.components import button, combobox,statictext,checkbox,staticbox
|
|
import wx
|
|
import os, re
|
|
import string
|
|
import sys
|
|
from time import strftime
|
|
import codecs
|
|
|
|
class Moses2TMX(model.Background):
    """PythonCard window that turns pairs of aligned text files into TMX files.

    For every "*.moses" file found below the selected directory, the file is
    paired with the source-language file obtained by dropping the last nine
    characters of its name (".xx.moses"), and a "<source>.tmx" file is written
    next to them with one translation unit per pair of non-blank lines.
    """

    def on_initialize(self, event):
        """Load the language list and fill both language combo boxes.

        "LanguageCodes.txt" is read from the directory that holds the program
        file. If it cannot be opened, the user is alerted and the program
        exits.
        """
        self.inputdir = ''
        # Get directory where program file is and ...
        currdir = os.path.abspath(os.path.dirname(os.path.realpath(sys.argv[0])))
        # ... load the file ("LanguageCodes.txt") with the list of languages
        # that the program can process.
        try:
            # Read-only access is enough: the file is never written back, and
            # asking for write access would reject a read-only installation.
            codesfile = open(os.path.join(currdir, 'LanguageCodes.txt'), 'r')
            try:
                # Remove the end of line marker from each entry.
                self.languages = [entry.rstrip() for entry in codesfile]
            finally:
                codesfile.close()
        except (IOError, OSError):
            # The languages file is essential for the program: alert the user
            # and exit.
            dialog.alertDialog(self, 'The file "LanguageCodes.txt" is missing. The program will now close.', 'Essential file missing')
            sys.exit()
        self.lang1code = ''
        self.lang2code = ''
        # Insert the list of language names in both combo boxes of the window.
        self.components.cbStartingLanguage.items = self.languages
        self.components.cbDestinationLanguage.items = self.languages

    def CreateTMX(self, name):
        """Write "<name>.tmx" from the file *name* and its translated counterpart.

        The counterpart is "<name>.<xx>.moses", where <xx> is the lower-cased
        first two characters of the destination language chosen in the window.
        Both files are read as UTF-8; blank lines are dropped from each of them
        independently before the remaining lines are paired by position.

        @name: path of the source-language file (also the stem of the outputs)

        Raises IOError if one of the two input files cannot be opened.
        """
        print('Started at ' + strftime('%H-%M-%S'))
        # Get the starting language name (e.g.: "EN-GB") from the program window
        self.lang1code = self.components.cbStartingLanguage.text
        # Get the destination language name from the program window
        self.lang2code = self.components.cbDestinationLanguage.text
        suffix = '.' + self.lang2code[:2].lower()
        print(name + suffix)

        translated = name + suffix + '.moses'
        source_tmp = name + '.tmp'
        target_tmp = translated + '.tmp'
        try:
            # Pass 1: blank-free copies of both inputs. Each copy is closed
            # before it is read again, so its content is guaranteed on disk.
            self._copy_nonblank_lines(name, source_tmp)
            self._copy_nonblank_lines(translated, target_tmp)
            # Pass 2: pair the copies line by line into the TMX file.
            self._write_tmx(source_tmp, target_tmp, name + '.tmx')
        finally:
            # The temporary copies are removed even when a step above failed.
            for leftover in (source_tmp, target_tmp):
                if os.path.exists(leftover):
                    os.remove(leftover)

    def _copy_nonblank_lines(self, inpath, outpath):
        """Copy the UTF-8 file *inpath* to *outpath*, skipping whitespace-only lines."""
        # The input is opened first so that a missing input creates no output.
        reader = codecs.open(inpath, 'r', "utf-8", "strict")
        try:
            writer = codecs.open(outpath, 'w', "utf-8", "strict")
            try:
                for line in reader:
                    if line.strip():
                        writer.write(line)
            finally:
                writer.close()
        finally:
            reader.close()

    def _write_tmx(self, source_path, target_path, tmx_path):
        """Pair the lines of two blank-free files into the UTF-16 file *tmx_path*.

        Pairing stops as soon as either file runs out of lines. Segment text
        and language codes are XML-escaped so that characters such as "&" and
        "<" cannot produce a malformed document.
        """
        from xml.sax.saxutils import escape

        quote = {'"': '&quot;'}
        lang1 = escape(self.lang1code, quote)
        lang2 = escape(self.lang2code, quote)

        source = codecs.open(source_path, 'r', "utf-8", "strict")
        try:
            target = codecs.open(target_path, 'r', "utf-8", "strict")
            try:
                tmx = codecs.open(tmx_path, 'w', 'utf-16', 'strict')
                try:
                    tmx.write(
                        '<?xml version="1.0" ?>\n'
                        '<!DOCTYPE tmx SYSTEM "tmx14.dtd">\n'
                        '<tmx version="version 1.4">\n\n'
                        '<header\n'
                        'creationtool="moses2tmx"\n'
                        'creationtoolversion="1.032"\n'
                        'segtype="sentence"\n'
                        'datatype="PlainText"\n'
                        'adminlang="EN-US"\n'
                        'srclang="' + lang1 + '"\n'
                        '>\n</header>\n\n<body>\n')
                    while True:
                        self.ling1segm = source.readline().strip()
                        self.ling2segm = target.readline().strip()
                        # Blank lines were removed beforehand, so an empty
                        # segment means that one of the files is exhausted.
                        if not self.ling1segm or not self.ling2segm:
                            break
                        try:
                            tmx.write(
                                '<tu creationid="MT!">\n'
                                '<prop type="Txt::Translator">Moses</prop>\n'
                                '<tuv xml:lang="' + lang1 + '">\n'
                                '<seg>' + escape(self.ling1segm) + '</seg>\n'
                                '</tuv>\n'
                                '<tuv xml:lang="' + lang2 + '">\n'
                                '<seg>' + escape(self.ling2segm) + '</seg>\n'
                                '</tuv>\n</tu>\n\n')
                        except Exception:
                            # Best effort: a segment that cannot be written is
                            # skipped, but no longer silently.
                            print('Segment skipped in ' + tmx_path + ': ' + repr(sys.exc_info()[1]))
                    tmx.write('</body>\n</tmx>\n')
                finally:
                    tmx.close()
            finally:
                target.close()
        finally:
            source.close()

    def createTMXs(self):
        """Create a TMX file for every "*.moses" file below self.inputdir."""
        try:
            # Get a (lazy) list of all "*.moses" files that need to be processed
            fileslist = self.locate('*.moses', self.inputdir)
        except Exception:
            # self.errortypes is not initialised anywhere in this class, so
            # start from an empty string when it does not exist yet.
            self.errortypes = getattr(self, 'errortypes', '') + ' - Get All Segments: creation of output files error\n'
            fileslist = []
        # For each relevant file ...
        for path in fileslist:
            self.presentfile = path
            # ... drop the trailing ".xx.moses" (9 characters) to get the name
            # of the source-language file expected by CreateTMX.
            self.CreateTMX(self.presentfile[:-9])
        print('Finished at ' + strftime('%H-%M-%S'))
        dialog.alertDialog(self, 'Processing done.', 'Processing Done')

    def on_btnCreateTMX_mouseClick(self, event):
        """Button handler: create the TMX files."""
        self.createTMXs()

    def on_menuFileCreateTMXFiles_select(self, event):
        """Menu handler: create the TMX files."""
        self.createTMXs()

    def on_btnSelectLang1File_mouseClick(self, event):
        """Button handler: store the chosen first-language file in self.input1."""
        # NOTE(review): GetInputFileName is not defined in this class -
        # confirm that it is provided elsewhere or that this handler is unused.
        self.input1 = self.GetInputFileName()

    def on_btnSelectLang2File_mouseClick(self, event):
        """Button handler: store the chosen second-language file in self.input2."""
        # NOTE(review): GetInputFileName is not defined in this class -
        # confirm that it is provided elsewhere or that this handler is unused.
        self.input2 = self.GetInputFileName()

    def locate(self, pattern, basedir):
        """Yield the path of every file matching *pattern* in and below *basedir*.

        @pattern: something like '*.tmx'
        @basedir: whole directory to be treated
        """
        import fnmatch
        for path, dirs, files in os.walk(os.path.abspath(basedir)):
            for filename in fnmatch.filter(files, pattern):
                yield os.path.join(path, filename)

    def SelectDirectory(self):
        """Ask the user for the directory where the files to be processed are.

        When the dialog is accepted, the chosen path is stored in
        self.inputdir and shown in the status bar of the program window;
        otherwise nothing changes.
        """
        result = dialog.directoryDialog(self, 'Choose a directory', 'a')
        if result.accepted:
            self.inputdir = result.path
            self.statusBar.text = self.inputdir + ' selected.'

    def on_menuFileSelectDirectory_select(self, event):
        """Menu handler: select the working directory."""
        self.SelectDirectory()

    def on_btnSelectDirectory_mouseClick(self, event):
        """Button handler: select the working directory."""
        self.SelectDirectory()

    def on_menuHelpShowHelp_select(self, event):
        """Menu handler: show "_READ_ME_FIRST.txt" in a scrolled message dialog."""
        # The name is relative, so it is resolved against the current working
        # directory of the process.
        helpfile = open('_READ_ME_FIRST.txt', "r")
        try:
            msg = helpfile.read()
        finally:
            helpfile.close()
        dialog.scrolledMessageDialog(self, msg, '_READ_ME_FIRST.txt')

    def on_menuFileExit_select(self, event):
        """Menu handler: quit the program."""
        sys.exit()
|
|
|
|
|
|
if __name__ == '__main__':
    # Run as a script: wrap the Moses2TMX window in a PythonCard application
    # and hand control to its main loop.
    app = model.Application(Moses2TMX)
    app.MainLoop()
|