// vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include "Loader.h" #include "LoaderFactory.h" #include "PhraseDictionaryFuzzyMatch.h" #include "moses/FactorCollection.h" #include "moses/Word.h" #include "moses/Util.h" #include "moses/InputFileStream.h" #include "moses/StaticData.h" #include "moses/WordsRange.h" #include "moses/UserMessage.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h" #include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h" #include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h" #include "util/file.hh" #include "util/exception.hh" using namespace std; #if defined __MINGW32__ && !defined mkdtemp #include #include char *mkdtemp(char *tempbuf) { int rand_value = 0; char* tempbase = NULL; char tempbasebuf[MAX_PATH] = ""; if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) { errno = EINVAL; return NULL; } srand((unsigned)time(0)); rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6); tempbase = strrchr(tempbuf, '/'); tempbase = tempbase ? tempbase+1 : tempbuf; strcpy(tempbasebuf, tempbase); sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value); ::GetTempPath(MAX_PATH, tempbuf); strcat(tempbuf, tempbasebuf); ::CreateDirectory(tempbuf, NULL); return tempbuf; } #endif namespace Moses { PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line) :PhraseDictionary(line) ,m_config(3) ,m_FuzzyMatchWrapper(NULL) { ReadParameters(); } PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch() { delete m_FuzzyMatchWrapper; } void PhraseDictionaryFuzzyMatch::Load() { SetFeaturesToApply(); m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]); } ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager( const ChartParser &parser, const ChartCellCollectionBase &cellCollection, std::size_t /*maxChartSpan*/) { return new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this); } void PhraseDictionaryFuzzyMatch:: SetParameter(const std::string& key, const std::string& value) { if (key == "source") { m_config[0] = value; } else if (key == "target") { m_config[1] = value; } else if (key == "alignment") { m_config[2] = value; } else { PhraseDictionary::SetParameter(key, value); } } int removedirectoryrecursively(const char *dirname) { #if defined __MINGW32__ //TODO(jie): replace this function with boost implementation #else DIR *dir; struct dirent *entry; char path[PATH_MAX]; if (path == NULL) { fprintf(stderr, "Out of memory error\n"); return 0; } dir = opendir(dirname); if (dir == NULL) { perror("Error opendir()"); return 0; } while ((entry = readdir(dir)) != NULL) { if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) { snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name); if (entry->d_type == DT_DIR) { removedirectoryrecursively(path); } remove(path); /* * Here, the actual deletion must be done. Beacuse this is * quite a dangerous thing to do, and this program is not very * well tested, we are just printing as if we are deleting. */ //printf("(not really) Deleting: %s\n", path); /* * When you are finished testing this and feel you are ready to do the real * deleting, use this: remove*STUB*(path); * (see "man 3 remove") * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this! */ } } closedir(dir); rmdir(dirname); /* * Now the directory is emtpy, finally delete the directory itself. (Just * printing here, see above) */ //printf("(not really) Deleting: %s\n", dirname); #endif return 1; } void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence) { #if defined __MINGW32__ char dirName[] = "moses.XXXXXX"; #else char dirName[] = "/tmp/moses.XXXXXX"; #endif // defined char *temp = mkdtemp(dirName); UTIL_THROW_IF2(temp == NULL, "Couldn't create temporary directory " << dirName); string dirNameStr(dirName); string inFileName(dirNameStr + "/in"); ofstream inFile(inFileName.c_str()); for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) { inFile << inputSentence.GetWord(i); } inFile << endl; inFile.close(); long translationId = inputSentence.GetTranslationId(); string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr); // populate with rules for this sentence PhraseDictionaryNodeMemory &rootNode = m_collection[translationId]; FormatType format = MosesFormat; // data from file InputFileStream inStream(ptFileName); // copied from class LoaderStandard PrintUserTime("Start loading fuzzy-match phrase model"); const StaticData &staticData = StaticData::Instance(); string lineOrig; size_t count = 0; while(getline(inStream, lineOrig)) { const string *line; if (format == HieroFormat) { // reformat line UTIL_THROW(util::Exception, "Cannot be Hiero format"); //line = ReformatHieroRule(lineOrig); } else { // do nothing to format of line line = &lineOrig; } vector tokens; vector scoreVector; TokenizeMultiCharSeparator(tokens, *line , "|||" ); if (tokens.size() != 4 && tokens.size() != 5) { UTIL_THROW2("Syntax error at " << ptFileName << ":" << count); } const string &sourcePhraseString = tokens[0] , &targetPhraseString = tokens[1] , &scoreString = tokens[2] , &alignString = tokens[3]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } Tokenize(scoreVector, scoreString); const size_t numScoreComponents = GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } UTIL_THROW_IF2(scoreVector.size() != numScoreComponents, "Number of scores incorrectly specified"); // parse source & find pt node // constituent labels Word *sourceLHS; Word *targetLHS; // source Phrase sourcePhrase( 0); sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(this); targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; if (format == HieroFormat) { // reformat line delete line; } else { // do nothing } } // sort and prune each target phrase collection SortAndPrune(rootNode); //removedirectoryrecursively(dirName); } TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode , const Phrase &source , const TargetPhrase &target , const Word *sourceLHS) { PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS); return currNode.GetTargetPhraseCollection(); } PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode , const Phrase &source , const TargetPhrase &target , const Word *sourceLHS) { cerr << source << endl << target << endl; const size_t size = source.GetSize(); const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin(); PhraseDictionaryNodeMemory *currNode = &rootNode; for (size_t pos = 0 ; pos < size ; ++pos) { const Word& word = source.GetWord(pos); if (word.IsNonTerminal()) { // indexed by source label 1st const Word &sourceNonTerm = word; UTIL_THROW_IF2(iterAlign == alignmentInfo.end(), "No alignment for non-term at position " << pos); UTIL_THROW_IF2(iterAlign->first != pos, "Alignment info incorrect at position " << pos); size_t targetNonTermInd = iterAlign->second; ++iterAlign; const Word &targetNonTerm = target.GetWord(targetNonTermInd); #if defined(UNLABELLED_SOURCE) currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm); #else currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm); #endif } else { currNode = currNode->GetOrCreateChild(word); } UTIL_THROW_IF2(currNode == NULL, "Node not found at position " << pos); } // finally, the source LHS //currNode = currNode->GetOrCreateChild(sourceLHS); return *currNode; } void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode) { if (GetTableLimit()) { rootNode.Sort(GetTableLimit()); } } void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source) { m_collection.erase(source.GetTranslationId()); } const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const { std::map::const_iterator iter = m_collection.find(translationId); UTIL_THROW_IF2(iter == m_collection.end(), "Couldn't find root node for input: " << translationId); return iter->second; } PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) { long transId = source.GetTranslationId(); std::map::iterator iter = m_collection.find(transId); UTIL_THROW_IF2(iter == m_collection.end(), "Couldn't find root node for input: " << transId); return iter->second; } TO_STRING_BODY(PhraseDictionaryFuzzyMatch); // friend ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict) { typedef PhraseDictionaryNodeMemory::TerminalMap TermMap; typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap; /* const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection; for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) { const Word &sourceNonTerm = p->first.first; out << sourceNonTerm; } for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) { const Word &sourceTerm = p->first; out << sourceTerm; } */ return out; } }