/*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include "SafeGetline.h" #include "ScoreFeature.h" #include "tables-core.h" #include "ExtractionPhrasePair.h" #include "score.h" #include "InputFileStream.h" #include "OutputFileStream.h" using namespace std; using namespace MosesTraining; #define LINE_MAX_LENGTH 100000 namespace MosesTraining { LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; bool pcfgFlag = false; bool treeFragmentsFlag = false; bool unpairedExtractFormatFlag = false; bool conditionOnTargetLhsFlag = false; bool wordAlignmentFlag = true; bool goodTuringFlag = false; bool kneserNeyFlag = false; bool logProbFlag = false; int negLogProb = 1; #define COC_MAX 10 bool lexFlag = true; bool unalignedFlag = false; bool unalignedFWFlag = false; bool crossedNonTerm = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; float minCountHierarchical = 0; std::map sourceLHSCounts; std::map* > targetLHSAndSourceLHSJointCounts; std::set sourceLabelSet; std::map sourceLabels; std::vector sourceLabelsByIndex; Vocabulary vcbT; Vocabulary vcbS; } // namespace std::vector tokenize( const char [] ); void processLine( std::string line, int lineID, bool includeSentenceIdFlag, int &sentenceId, PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment, std::string &additionalPropertiesString, float &count, float &pcfgSum ); void writeCountOfCounts( const std::string &fileNameCountOfCounts ); void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ); void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog ); double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource ); set functionWordList; void loadFunctionWords( const string &fileNameFunctionWords ); double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out ); void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out ); void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment ); int main(int argc, char* argv[]) { std::cerr << "Score v2.1 -- " << "scoring methods for extracted rules" << std::endl; ScoreFeatureManager featureManager; if (argc < 4) { std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } std::string fileNameExtract = argv[1]; std::string fileNameLex = argv[2]; std::string fileNamePhraseTable = argv[3]; std::string fileNameCountOfCounts; std::string fileNameFunctionWords; std::vector featureArgs; // all unknown args passed to feature manager for(int i=4; iOpen(fileNamePhraseTable); if (!success) { std::cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << std::endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; ExtractionPhrasePair *phrasePair = NULL; std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource; std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible int tmpSentenceId; PHRASE *tmpPhraseSource, *tmpPhraseTarget; ALIGNMENT *tmpTargetToSourceAlignment; std::string tmpAdditionalPropertiesString; float tmpCount=0.0f, tmpPcfgSum=0.0f; int i=0; SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); if ( !extractFileP.eof() ) { ++i; tmpPhraseSource = new PHRASE(); tmpPhraseTarget = new PHRASE(); tmpTargetToSourceAlignment = new ALIGNMENT(); processLine( std::string(line), i, featureManager.includeSentenceId(), tmpSentenceId, tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, tmpAdditionalPropertiesString, tmpCount, tmpPcfgSum); phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, tmpCount, tmpPcfgSum ); phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); phrasePairsWithSameSource.push_back( phrasePair ); if ( hierarchicalFlag ) { phrasePairsWithSameSourceAndTarget.push_back( phrasePair ); } strcpy( lastLine, line ); SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); } while ( !extractFileP.eof() ) { if ( ++i % 100000 == 0 ) { std::cerr << "." << std::flush; } // identical to last line? just add count if (strcmp(line,lastLine) == 0) { phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum); SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); continue; } else { strcpy( lastLine, line ); } tmpPhraseSource = new PHRASE(); tmpPhraseTarget = new PHRASE(); tmpTargetToSourceAlignment = new ALIGNMENT(); tmpAdditionalPropertiesString.clear(); processLine( std::string(line), i, featureManager.includeSentenceId(), tmpSentenceId, tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, tmpAdditionalPropertiesString, tmpCount, tmpPcfgSum); bool matchesPrevious = false; bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these, // ExtractionPhrasePair::Matches() checks them in order and does not continue with the others // once the first of them has been found to have to be set to false if ( hierarchicalFlag ) { for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin(); iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) { if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, sourceMatch, targetMatch, alignmentMatch ) ) { matchesPrevious = true; phrasePair = (*iter); break; } } } else { if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, sourceMatch, targetMatch, alignmentMatch ) ) { matchesPrevious = true; } } if ( matchesPrevious ) { delete tmpPhraseSource; delete tmpPhraseTarget; if ( !phrasePair->Add( tmpTargetToSourceAlignment, tmpCount, tmpPcfgSum ) ) { delete tmpTargetToSourceAlignment; } phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); } else { if ( !phrasePairsWithSameSource.empty() && !sourceMatch ) { processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { delete *iter; } phrasePairsWithSameSource.clear(); if ( hierarchicalFlag ) { phrasePairsWithSameSourceAndTarget.clear(); } } if ( hierarchicalFlag ) { if ( !phrasePairsWithSameSourceAndTarget.empty() && !targetMatch ) { phrasePairsWithSameSourceAndTarget.clear(); } } phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, tmpCount, tmpPcfgSum ); phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); phrasePairsWithSameSource.push_back(phrasePair); if ( hierarchicalFlag ) { phrasePairsWithSameSourceAndTarget.push_back(phrasePair); } } SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); } processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { delete *iter; } phrasePairsWithSameSource.clear(); phraseTableFile->flush(); if (phraseTableFile != &std::cout) { delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } } void processLine( std::string line, int lineID, bool includeSentenceIdFlag, int &sentenceId, PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment, std::string &additionalPropertiesString, float &count, float &pcfgSum ) { size_t foundAdditionalProperties = line.find("{{"); if (foundAdditionalProperties != std::string::npos) { additionalPropertiesString = line.substr(foundAdditionalProperties); line = line.substr(0,foundAdditionalProperties); } else { additionalPropertiesString.clear(); } phraseSource->clear(); phraseTarget->clear(); targetToSourceAlignment->clear(); std::vector token = tokenize( line.c_str() ); int item = 1; for ( size_t j=0; jpush_back( vcbS.storeIfNew( token[j] ) ); } else if (item == 2) { // target phrase phraseTarget->push_back( vcbT.storeIfNew( token[j] ) ); } else if (item == 3) { // alignment int s,t; sscanf(token[j].c_str(), "%d-%d", &s, &t); if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) { std::cerr << "WARNING: phrase pair " << lineID << " has alignment point (" << s << ", " << t << ")" << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")" << std::endl; } else { // first alignment point? -> initialize if ( targetToSourceAlignment->size() == 0 ) { size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); targetToSourceAlignment->resize(numberOfTargetSymbols); } // add alignment point targetToSourceAlignment->at(t).insert(s); } } else if (includeSentenceIdFlag && item == 4) { // optional sentence id sscanf(token[j].c_str(), "%d", &sentenceId); } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count sscanf(token[j].c_str(), "%f", &count); } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score float pcfgScore = std::atof(token[j].c_str()); pcfgSum = pcfgScore * count; } } if ( targetToSourceAlignment->size() == 0 ) { size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); targetToSourceAlignment->resize(numberOfTargetSymbols); } if (item + (includeSentenceIdFlag?-1:0) == 3) { count = 1.0; } if (item < 3 || item > 6) { std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl; } } void writeCountOfCounts( const string &fileNameCountOfCounts ) { // open file Moses::OutputFileStream countOfCountsFile; bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str()); if (!success) { std::cerr << "ERROR: could not open count-of-counts file " << fileNameCountOfCounts << std::endl; return; } // Kneser-Ney needs the total number of phrase pairs countOfCountsFile << totalDistinct << std::endl; // write out counts for(int i=1; i<=COC_MAX; i++) { countOfCountsFile << countOfCounts[ i ] << std::endl; } countOfCountsFile.Close(); } void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { if (phrasePairsWithSameSource.size() == 0) { return; } float totalSource = 0; //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl; // loop through phrase pairs for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { // add to total count totalSource += (*iter)->GetCount(); } // output the distinct phrase pairs, one at a time for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { // add to total count outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb ); } } void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { assert(phrasePair.IsValid()); const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource(); float count = phrasePair.GetCount(); map< string, float > domainCount; // collect count of count statistics if (goodTuringFlag || kneserNeyFlag) { totalDistinct++; int countInt = count + 0.99999; if (countInt <= COC_MAX) countOfCounts[ countInt ]++; } // compute PCFG score float pcfgScore = 0; if (pcfgFlag && !inverseFlag) { pcfgScore = phrasePair.GetPcfgScore() / count; } // output phrases const PHRASE *phraseSource = phrasePair.GetSource(); const PHRASE *phraseTarget = phrasePair.GetTarget(); // do not output if hierarchical and count below threshold if (hierarchicalFlag && count < minCountHierarchical) { for(size_t j=0; jsize()-1; ++j) { if (isNonTerminal(vcbS.getWord( phraseSource->at(j) ))) return; } } // source phrase (unless inverse) if (!inverseFlag) { printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; } // target phrase printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; // source phrase (if inverse) if (inverseFlag) { printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; } // alignment if ( hierarchicalFlag ) { // always output alignment if hiero style assert(phraseTarget->size() == bestAlignmentT2S->size()+1); std::vector alignment; for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) { if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) { if ( bestAlignmentT2S->at(j).size() != 1 ) { std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl; phraseTableFile.flush(); assert(bestAlignmentT2S->at(j).size() == 1); } size_t sourcePos = *(bestAlignmentT2S->at(j).begin()); //phraseTableFile << sourcePos << "-" << j << " "; std::stringstream point; point << sourcePos << "-" << j; alignment.push_back(point.str()); } else { for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { size_t sourcePos = *setIter; std::stringstream point; point << sourcePos << "-" << j; alignment.push_back(point.str()); } } } // now print all alignments, sorted by source index sort(alignment.begin(), alignment.end()); for (size_t i = 0; i < alignment.size(); ++i) { phraseTableFile << alignment[i] << " "; } } else if ( !inverseFlag && wordAlignmentFlag) { // alignment info in pb model for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) { for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { size_t sourcePos = *setIter; phraseTableFile << sourcePos << "-" << j << " "; } } } phraseTableFile << " ||| "; // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S ); phraseTableFile << maybeLogProb( lexScore ); } // unaligned word penalty if (unalignedFlag) { double penalty = computeUnalignedPenalty( bestAlignmentT2S ); phraseTableFile << " " << maybeLogProb( penalty ); } // unaligned function word penalty if (unalignedFWFlag) { double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S ); phraseTableFile << " " << maybeLogProb( penalty ); } if (crossedNonTerm && !inverseFlag) { phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S ); } // target-side PCFG score if (pcfgFlag && !inverseFlag) { phraseTableFile << " " << maybeLogProb( pcfgScore ); } // extra features ScoreFeatureContext context(phrasePair, maybeLogProb); std::vector extraDense; map extraSparse; featureManager.addFeatures(context, extraDense, extraSparse); for (size_t i = 0; i < extraDense.size(); ++i) { phraseTableFile << " " << extraDense[i]; } for (map::const_iterator i = extraSparse.begin(); i != extraSparse.end(); ++i) { phraseTableFile << " " << i->first << " " << i->second; } // counts phraseTableFile << " ||| " << totalCount << " " << count; if (kneserNeyFlag) phraseTableFile << " " << distinctCount; if ((treeFragmentsFlag) && !inverseFlag) { phraseTableFile << " |||"; } // tree fragments if (treeFragmentsFlag && !inverseFlag) { const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree"); if (bestTreeFragment) { phraseTableFile << " {{Tree " << *bestTreeFragment << "}}"; } } phraseTableFile << std::endl; } bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource ) { for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) { if (currTarget == targetPos) { // skip } else { const std::set &sourceSet = alignmentTargetToSource->at(currTarget); for (std::set::const_iterator iter = sourceSet.begin(); iter != sourceSet.end(); ++iter) { size_t currSource = *iter; if ((currTarget < targetPos && currSource > sourcePos) || (currTarget > targetPos && currSource < sourcePos) ) { return true; } } } } return false; } int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) { assert(phraseTarget->size() >= alignmentTargetToSource->size() ); for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) { if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) { const std::set &alignmentPoints = alignmentTargetToSource->at(targetPos); assert( alignmentPoints.size() == 1 ); size_t sourcePos = *alignmentPoints.begin(); bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource); if (ret) return 1; } } return 0; } double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse for(size_t ti=0; tisize(); ++ti) { const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); if (srcIndices.empty()) { unaligned *= 2.718; } } return unaligned; } double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse for(size_t ti=0; tisize(); ++ti) { const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) { unaligned *= 2.718; } } return unaligned; } void loadFunctionWords( const string &fileName ) { std::cerr << "Loading function word list from " << fileName; ifstream inFile; inFile.open(fileName.c_str()); if (inFile.fail()) { std::cerr << " - ERROR: could not open file" << std::endl; exit(1); } istream *inFileP = &inFile; char line[LINE_MAX_LENGTH]; while(true) { SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; std::vector token = tokenize( line ); if (token.size() > 0) functionWordList.insert( token[0] ); } std::cerr << " - read " << functionWordList.size() << " function words" << std::endl; inFile.close(); } double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) { // lexical translation probability double lexScore = 1.0; int null = vcbS.getWordID("NULL"); // all target words have to be explained for(size_t ti=0; tisize(); ti++) { const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); if (srcIndices.empty()) { // explain unaligned word by NULL lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) ); } else { // go through all the aligned words to compute average double thisWordScore = 0; for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) { thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) ); } lexScore *= thisWordScore / (double)srcIndices.size(); } } return lexScore; } void LexicalTable::load( const string &fileName ) { std::cerr << "Loading lexical translation table from " << fileName; ifstream inFile; inFile.open(fileName.c_str()); if (inFile.fail()) { std::cerr << " - ERROR: could not open file" << std::endl; exit(1); } istream *inFileP = &inFile; char line[LINE_MAX_LENGTH]; int i=0; while(true) { i++; if (i%100000 == 0) std::cerr << "." << flush; SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; std::vector token = tokenize( line ); if (token.size() != 3) { std::cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:" << std::endl << token.size() << " " << token[0] << " " << line << std::endl; continue; } double prob = atof( token[2].c_str() ); WORD_ID wordT = vcbT.storeIfNew( token[0] ); WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = prob; } std::cerr << std::endl; } void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out) { // get corresponding target non-terminal and output pair ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT(); invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment); // output source symbols, except root, in rule table format for (std::size_t i = 0; i < phraseSource->size()-1; ++i) { const std::string &word = vcbS.getWord(phraseSource->at(i)); if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { out << word << " "; continue; } const std::set &alignmentPoints = sourceToTargetAlignment->at(i); assert(alignmentPoints.size() == 1); size_t j = *(alignmentPoints.begin()); if (inverseFlag) { out << vcbT.getWord(phraseTarget->at(j)) << word << " "; } else { out << word << vcbT.getWord(phraseTarget->at(j)) << " "; } } // output source root symbol if (conditionOnTargetLhsFlag && !inverseFlag) { out << "[X]"; } else { out << vcbS.getWord(phraseSource->back()); } delete sourceToTargetAlignment; } void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out) { // output target symbols, except root, in rule table format for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) { const std::string &word = vcbT.getWord(phraseTarget->at(i)); if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { out << word << " "; continue; } // get corresponding source non-terminal and output pair std::set alignmentPoints = targetToSourceAlignment->at(i); assert(alignmentPoints.size() == 1); int j = *(alignmentPoints.begin()); if (inverseFlag) { out << word << vcbS.getWord(phraseSource->at(j)) << " "; } else { out << vcbS.getWord(phraseSource->at(j)) << word << " "; } } // output target root symbol if (conditionOnTargetLhsFlag) { if (inverseFlag) { out << "[X]"; } else { out << vcbS.getWord(phraseSource->back()); } } else { out << vcbT.getWord(phraseTarget->back()); } } void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) { // typedef std::vector< std::set > ALIGNMENT; outSourceToTargetAlignment->clear(); size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size()); outSourceToTargetAlignment->resize(numberOfSourceSymbols); // add alignment point for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) { for ( std::set::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin(); setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) { size_t sourcePosition = *setIter; outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition); } } }