2013-12-02 19:29:13 +04:00
|
|
|
// vim:tabstop=2
|
2013-12-04 22:47:33 +04:00
|
|
|
#include <stdlib.h>
|
2013-12-02 19:29:13 +04:00
|
|
|
#include "TransliterationPhraseDictionary.h"
|
|
|
|
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
// Constructs the dictionary from its configuration line. The line is passed
// on to the PhraseDictionary base constructor, after which ReadParameters()
// is called.
TransliterationPhraseDictionary::TransliterationPhraseDictionary(const std::string &line)
  : PhraseDictionary(line)
{
  ReadParameters();
}
|
|
|
|
|
|
|
|
void TransliterationPhraseDictionary::CleanUpAfterSentenceProcessing(const InputType& source)
|
|
|
|
{
|
|
|
|
RemoveAllInColl(m_allTPColl);
|
|
|
|
}
|
|
|
|
|
|
|
|
void TransliterationPhraseDictionary::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
|
|
|
|
{
|
2013-12-04 22:47:33 +04:00
|
|
|
string mosesDir = "/home/hieu/workspace/github/mosesdecoder";
|
|
|
|
string scriptDir = mosesDir + "/scripts";
|
|
|
|
string externalDir = "/home/hieu/workspace/bin/training-tools";
|
|
|
|
string modelDir = "/home/hieu/workspace/experiment/data/issues/transliteration/Transliteration.3";
|
|
|
|
string inputLang = "en";
|
|
|
|
string outputLang = "ar";
|
|
|
|
|
2013-12-02 19:29:13 +04:00
|
|
|
InputPathList::const_iterator iter;
|
|
|
|
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
|
|
|
|
InputPath &inputPath = **iter;
|
|
|
|
const Phrase &sourcePhrase = inputPath.GetPhrase();
|
|
|
|
|
|
|
|
if (sourcePhrase.GetSize() != 1) {
|
|
|
|
// only translit single words. This should be user configurable
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
InputPath::TargetPhrases::const_iterator iter;
|
|
|
|
for (iter = inputPath.GetTargetPhrases().begin(); iter != inputPath.GetTargetPhrases().end(); ++iter) {
|
|
|
|
const std::pair<const TargetPhraseCollection*, const void*> &temp = iter->second;
|
2013-12-02 19:31:15 +04:00
|
|
|
const TargetPhraseCollection *tpCollPrev = temp.first;
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-02 19:31:15 +04:00
|
|
|
if (tpCollPrev && tpCollPrev->GetSize()) {
|
2013-12-02 19:29:13 +04:00
|
|
|
// already have translation from another pt. Don't transliterate
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// TRANSLITERATE
|
2013-12-04 22:47:33 +04:00
|
|
|
char *ptr = tmpnam(NULL);
|
|
|
|
string inFile(ptr);
|
|
|
|
ptr = tmpnam(NULL);
|
|
|
|
string outDir(ptr);
|
|
|
|
|
|
|
|
ofstream inStream(inFile.c_str());
|
|
|
|
inStream << sourcePhrase.ToString() << endl;
|
|
|
|
inStream.close();
|
|
|
|
|
|
|
|
string cmd = scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
|
|
|
|
" --transliteration-model-dir " + modelDir +
|
|
|
|
" --moses-src-dir " + mosesDir +
|
|
|
|
" --external-bin-dir " + externalDir +
|
|
|
|
" --input-extension " + inputLang +
|
|
|
|
" --output-extension " + outputLang +
|
|
|
|
" --oov-file " + inFile +
|
|
|
|
" --out-dir " + outDir;
|
|
|
|
|
|
|
|
int ret = system(cmd.c_str());
|
|
|
|
UTIL_THROW_IF2(ret != 0, "Transliteration script error");
|
|
|
|
|
2013-12-02 19:29:13 +04:00
|
|
|
TargetPhraseCollection *tpColl = new TargetPhraseCollection();
|
2013-12-04 22:47:33 +04:00
|
|
|
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
|
|
|
|
vector<TargetPhrase*>::const_iterator iter;
|
|
|
|
for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
|
|
|
|
TargetPhrase *tp = *iter;
|
|
|
|
tpColl->Add(tp);
|
|
|
|
}
|
2013-12-02 19:29:13 +04:00
|
|
|
|
|
|
|
m_allTPColl.push_back(tpColl);
|
|
|
|
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
remove(inFile.c_str());
|
|
|
|
|
|
|
|
cmd = "rm -rf " + outDir;
|
|
|
|
system(cmd.c_str());
|
2013-12-02 19:29:13 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
std::vector<TargetPhrase*> TransliterationPhraseDictionary::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const
|
2013-12-02 19:29:13 +04:00
|
|
|
{
|
2013-12-04 22:47:33 +04:00
|
|
|
std::vector<TargetPhrase*> ret;
|
|
|
|
|
|
|
|
string outPath = outDir + "/out.txt";
|
|
|
|
ifstream outStream(outPath.c_str());
|
|
|
|
|
|
|
|
string line;
|
|
|
|
while (getline(outStream, line)) {
|
|
|
|
vector<string> toks;
|
|
|
|
Tokenize(toks, line, "\t");
|
|
|
|
UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");
|
|
|
|
|
|
|
|
TargetPhrase *tp = new TargetPhrase();
|
|
|
|
Word &word = tp->AddWord();
|
|
|
|
word.CreateFromString(Output, m_output, toks[0], false);
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
float score = Scan<float>(toks[1]);
|
|
|
|
tp->GetScoreBreakdown().PlusEquals(this, score);
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
// score of all other ff when this rule is being loaded
|
|
|
|
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
ret.push_back(tp);
|
|
|
|
}
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
outStream.close();
|
2013-12-02 19:29:13 +04:00
|
|
|
|
2013-12-04 22:47:33 +04:00
|
|
|
return ret;
|
2013-12-02 19:29:13 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Chart rule lookup hook. No lookup manager is created: both arguments are
// ignored and NULL is always returned. The ChartRuleLookupManagerSkeleton
// based variant is left disabled below.
ChartRuleLookupManager* TransliterationPhraseDictionary::CreateRuleLookupManager(const ChartParser &parser,
    const ChartCellCollectionBase &cellCollection)
{
  // disabled: return new ChartRuleLookupManagerSkeleton(parser, cellCollection, *this);
  return NULL;
}
|
|
|
|
|
|
|
|
// Presumably expands to this class's ToString() definition; the macro is
// defined outside this file -- TODO confirm against its definition.
TO_STRING_BODY(TransliterationPhraseDictionary);
|
|
|
|
|
|
|
|
// friend
|
|
|
|
ostream& operator<<(ostream& out, const TransliterationPhraseDictionary& phraseDict)
|
|
|
|
{
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|