/** * Common lossy counting phrase extraction functionality implementation. * * Note: The bulk of this unit is based on Philipp Koehn's code from * phrase-extract/extract.cpp. * * (C) Moses: http://www.statmt.org/moses/ * (C) Ceslav Przywara, UFAL MFF UK, 2011 * * $Id$ */ #include #include #include #include "phrase-extract.h" #include "ISS.h" // I'm using my own version of SafeGetline (without "using namespace std;"): #include "SafeGetline.h" #define LINE_MAX_LENGTH 60000 //////// Helping functions //////// // For sorted output. typedef std::pair output_pair_t; typedef std::vector output_vector_t; class PhraseComp { /** @var If true, sort by target phrase first. */ bool _inverted; bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b); int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b); public: PhraseComp(bool inverted): _inverted(inverted) {} bool operator()(const output_pair_t& a, const output_pair_t& b); }; void processSortedOutput(OutputProcessor& processor); void processUnsortedOutput(OutputProcessor& processor); void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode); //////// Define variables declared as extern in the header ///////////////////// bool allModelsOutputFlag = false; bool wordModel = false; // IBM word model. REO_MODEL_TYPE wordType = REO_MSD; bool phraseModel = false; // Std phrase-based model. REO_MODEL_TYPE phraseType = REO_MSD; bool hierModel = false; // Hierarchical model. REO_MODEL_TYPE hierType = REO_MSD; int maxPhraseLength = 0; // Eg. 7 bool translationFlag = true; // Generate extract and extract.inv bool orientationFlag = false; // Ordering info needed? bool sortedOutput = false; // Sort output? LossyCountersVector lossyCounters; #ifdef GET_COUNTS_ONLY std::vector phrasePairsCounters; #endif //////// Internal module variables ///////////////////////////////////////////// IndexedStringsStorage strings; IndexedStringsStorage orientations; //////// Untouched Philipp Koehn's code :) ///////////////////////////////////// REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int) ) { if( connectedLeftTop && !connectedRightTop) return LEFT; if(modelType == REO_MONO) return UNKNOWN; if (!connectedLeftTop && connectedRightTop) return RIGHT; if(modelType == REO_MSD) return UNKNOWN; for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) connectedLeftTop = isAligned(sentence, indexF, startE-unit); for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit) connectedRightTop = isAligned(sentence, indexF, startE-unit); if(connectedLeftTop && !connectedRightTop) return DRIGHT; else if(!connectedLeftTop && connectedRightTop) return DLEFT; return UNKNOWN; } // to be called with countF-1 instead of countF REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft) { HSentenceVertices::const_iterator it; if((connectedLeftTop && !connectedRightTop) || //(startE == 0 && startF == 0) || //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) || ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && it->second.find(startF-unit) != it->second.end())) return LEFT; if(modelType == REO_MONO) return UNKNOWN; if((!connectedLeftTop && connectedRightTop) || ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())) return RIGHT; if(modelType == REO_MSD) return UNKNOWN; connectedLeftTop = false; for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && it->second.find(indexF) != it->second.end()) return DRIGHT; connectedRightTop = false; for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && it->second.find(indexF) != it->second.end()) return DLEFT; return UNKNOWN; } // to be called with countF-1 instead of countF REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft, const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft, REO_POS phraseOrient) { HSentenceVertices::const_iterator it; if(phraseOrient == LEFT || (connectedLeftTop && !connectedRightTop) || // (startE == 0 && startF == 0) || //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) || ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && it->second.find(startF-unit) != it->second.end()) || ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() && it->second.find(startF-unit) != it->second.end())) return LEFT; if(modelType == REO_MONO) return UNKNOWN; if(phraseOrient == RIGHT || (!connectedLeftTop && connectedRightTop) || ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end()) || ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && it->second.find(endF + unit) != it->second.end())) return RIGHT; if(modelType == REO_MSD) return UNKNOWN; if(phraseOrient != UNKNOWN) return phraseOrient; connectedLeftTop = false; for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { if((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && it->second.find(indexF) != it->second.end()) || (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() && it->second.find(indexF) != it->second.end())) return DRIGHT; } connectedRightTop = false; for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() && it->second.find(indexF) != it->second.end()) || (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() && it->second.find(indexF) != it->second.end())) return DLEFT; } return UNKNOWN; } void insertVertex( HSentenceVertices & corners, int x, int y ) { std::set tmp; tmp.insert(x); std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair > (y, tmp) ); if(ret.second == false) { ret.first->second.insert(x); } } void insertPhraseVertices( HSentenceVertices & topLeft, HSentenceVertices & topRight, HSentenceVertices & bottomLeft, HSentenceVertices & bottomRight, int startF, int startE, int endF, int endE) { insertVertex(topLeft, startF, startE); insertVertex(topRight, endF, startE); insertVertex(bottomLeft, startF, endE); insertVertex(bottomRight, endF, endE); } std::string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) { switch(orient) { case LEFT: return "mono"; break; case RIGHT: return "swap"; break; case DRIGHT: return "dright"; break; case DLEFT: return "dleft"; break; case UNKNOWN: switch(modelType) { case REO_MONO: return "nomono"; break; case REO_MSD: return "other"; break; case REO_MSLR: return "dright"; break; } break; } } bool ge(int first, int second) { return first >= second; } bool le(int first, int second) { return first <= second; } bool lt(int first, int second) { return first < second; } bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) { if (ei == -1 && fi == -1) return true; if (ei <= -1 || fi <= -1) return false; if (ei == sentence.target.size() && fi == sentence.source.size()) return true; if (ei >= sentence.target.size() || fi >= sentence.source.size()) return false; for(int i=0; i usedF = sentence.alignedCountS; for (int ei = startE; ei <= endE; ei++) { for (int i = 0; i < sentence.alignedToT[ei].size(); i++) { int fi = sentence.alignedToT[ei][i]; if (fi < minF) { minF = fi; } if (fi > maxF) { maxF = fi; } usedF[ fi ]--; } } if (maxF >= 0 && // aligned to any source words at all (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits // check if source words are aligned to out of bound target words bool out_of_bounds = false; for (int fi=minF; fi<=maxF && !out_of_bounds; fi++) { if (usedF[fi]>0) { // cout << "ouf of bounds: " << fi << "\n"; out_of_bounds = true; } } // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; if (!out_of_bounds) { // start point of source phrase may retreat over unaligned for (int startF=minF; (startF>=0 && (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned startF-- ) // end point of source phrase may advance over unaligned for (int endF=maxF; (endFlossyCounter.add(indexed_phrases_pair_t(srcPhraseIndices, tgtPhraseIndices, orientations.put(orientationInfo.c_str()), alignment)); // if ( lossyCounters[idx]->lossyCounter.aboutToPrune() ) { // Next addition will lead to pruning, inform: std::cerr << 'P' << idx << std::flush; } #endif } // end of addPhrase() /////// Lossy Counting related code //////////////////////////////////////////// void readInput(std::istream& eFile, std::istream& fFile, std::istream& aFile) { // Note: moved out of the loop. char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; int i = 0; while(true) { // Report progress? if (++i%10000 == 0) std::cerr << "." << std::flush; SAFE_GETLINE(eFile, englishString, LINE_MAX_LENGTH, '\n', __FILE__); if (eFile.eof()) break; SAFE_GETLINE(fFile, foreignString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE(aFile, alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); SentenceAlignment sentence; if (sentence.create(englishString, foreignString, alignmentString, i)) { extract(sentence); } } } void processOutput(OutputProcessor& processor) { if ( sortedOutput ) { processSortedOutput(processor); } else { processUnsortedOutput(processor); } } bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) { int cmp = _inverted ? comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase()) : comparePhrases(a.first.srcPhrase(), b.first.srcPhrase()); if ( cmp == 0 ) { // First part of pairs matches, compare the second part. cmp = _inverted ? comparePhrases(a.first.srcPhrase(), b.first.srcPhrase()) : comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase()); if ( cmp == 0 ) { // Also second part matches, compare alignments. return compareAlignments(a.first, b.first); } else { return cmp < 0; } } else { return cmp < 0; } } bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b) { size_t aSize = a.alignmentLength(); size_t bSize = b.alignmentLength(); size_t min = std::min(aSize, bSize); const indexed_phrases_pair_t::alignment_point_t * aAlignment = a.alignmentData(); const indexed_phrases_pair_t::alignment_point_t * bAlignment = b.alignmentData(); int cmp = 0; for ( size_t i = 0; i < min; ++i ) { // Important: alignments have to be eventually inverted as well! if ( _inverted ) { // Inverted = compare TGT phrase alignment points first. cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t)); } else{ // NOT inverted = compare SRC phrase alignment points first. cmp = memcmp(aAlignment+ i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t)); } if ( cmp == 0 ) { if ( _inverted ) { // Inverted = compare SRC phrase alignment points second. cmp = memcmp(aAlignment + i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t)); } else{ // NOT inverted = compare TGT phrase alignment points second. cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t)); } if ( cmp != 0 ) { return cmp < 0; } // Otherwise continue looping. } else { return cmp < 0; } } // Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one. return (cmp == 0) ? (aSize < bSize) : (cmp < 0); } int PhraseComp::comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b) { size_t aSize = a.size(); size_t bSize = b.size(); size_t min = std::min(aSize, bSize); int cmp = 0; for ( size_t i = 0; i < min; ++i ) { cmp = strcmp(strings.get(a[i]), strings.get(b[i])); if ( cmp != 0 ) { return cmp; } } if ( aSize == bSize ) { return 0; } if ( aSize < bSize ) { return strcmp("|||", strings.get(b[min])); } else { return strcmp(strings.get(a[min]), "|||"); } } void processSortedOutput(OutputProcessor& processor) { output_vector_t output; LossyCountersVector::value_type current = NULL, prev = NULL; for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0. current = lossyCounters[i]; if ( current != prev ) { PhrasePairsLossyCounter& lossyCounter = current->lossyCounter; for ( PhrasePairsLossyCounter::erasing_iterator phraseIter = lossyCounter.beginErase(); phraseIter != lossyCounter.endErase(); ++phraseIter ) { // Store and... output.push_back(std::make_pair(phraseIter.item(), phraseIter.frequency())); // ...update counters. current->outputMass += phraseIter.frequency(); current->outputSize += 1; } // prev = current; //delete current; } } // Sort by source phrase. std::sort(output.begin(), output.end(), PhraseComp(false)); // Print. for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) { flushPhrasePair(processor, iter->first, iter->second, 1); } // Sort by target phrase. std::sort(output.begin(), output.end(), PhraseComp(true)); // Print. for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) { flushPhrasePair(processor, iter->first, iter->second, -1); } } void processUnsortedOutput(OutputProcessor& processor) { LossyCountersVector::value_type current = NULL, prev = NULL; for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0. current = lossyCounters[i]; if ( current != prev ) { const PhrasePairsLossyCounter& lossyCounter = current->lossyCounter; for ( PhrasePairsLossyCounter::const_iterator phraseIter = lossyCounter.begin(); phraseIter != lossyCounter.end(); ++phraseIter ) { // Flush and... flushPhrasePair(processor, phraseIter.item(), phraseIter.frequency(), 0); // ...update counters. current->outputMass += phraseIter.frequency(); current->outputSize += 1; } // prev = current; } } } void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode = 0) { const indexed_phrases_pair_t::phrase_t srcPhraseIndices = indexedPhrasePair.srcPhrase(); const indexed_phrases_pair_t::phrase_t tgtPhraseIndices = indexedPhrasePair.tgtPhrase(); std::string srcPhrase, tgtPhrase; for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = srcPhraseIndices.begin(); indexIter != srcPhraseIndices.end(); ++indexIter ) { srcPhrase += std::string(strings.get(*indexIter)) + " "; } srcPhrase.resize(srcPhrase.size() - 1); // Trim the trailing " " for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = tgtPhraseIndices.begin(); indexIter != tgtPhraseIndices.end(); ++indexIter ) { tgtPhrase += std::string(strings.get(*indexIter)) + " "; } tgtPhrase.resize(tgtPhrase.size() - 1); // Trim the trailing " " // Actual processing is done via call to functor: processor(srcPhrase, tgtPhrase, orientations.get(indexedPhrasePair.orientationInfo()), indexedPhrasePair.alignment(), frequency, mode); } void printStats(void) { // Total counters. size_t outputMass = 0, outputSize = 0, N = 0; const std::string hline = "####################################################################################################################"; std::cerr << "Lossy Counting Phrase Extraction statistics:" << std::endl; // Print header: | 3 | 15 | 15 | 15 | 7 | 10 | 10 | 10 | std::cerr << hline << std::endl << "# length # unique out # total out # total in (N) # out/in (%) # pos. thr. # neg. thr. # max. err. #" << std::endl << hline << std::endl; LossyCountersVector::value_type current = NULL, prev = NULL; size_t from = 1, to = 1; for ( size_t i = 1; i <= lossyCounters.size(); ++i ) { // Intentionally skip 0, intentionally increment till == size(). current = (i < lossyCounters.size()) ? lossyCounters[i] : NULL; if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) { // Time to print. to = i-1; // Increment overall stats. outputMass += prev->outputMass; outputSize += prev->outputSize; N += prev->lossyCounter.count(); // Print. if ( from == to ) { std::cerr << "# " << std::setw(6) << to << " # "; } else { std::stringstream strStr; strStr << from << "-" << to; std::cerr << "# " << std::setw(6) << strStr.str() << " # "; } // Print the rest of record. std::cerr << std::setw(15) << prev->outputSize << " # " << std::setw(15) << prev->outputMass << " # " << std::setw(15) << prev->lossyCounter.count() << " # " << std::setw(10) << std::setprecision(4) << (static_cast(prev->outputMass) / static_cast(prev->lossyCounter.count())) * 100 << " # " << std::setw(10) << prev->lossyCounter.threshold(true) << " # " << std::setw(10) << prev->lossyCounter.threshold() << " # " << std::setw(10) << prev->lossyCounter.maxError() << " #" << std::endl << hline << std::endl; from = i; } prev = current; } // Print summary: std::cerr << "# TOTAL # " << std::setw(15) << outputSize << " # " << std::setw(15) << outputMass << " # " << std::setw(15) << N << " # " << std::setw(10) << std::setprecision(4) << (static_cast(outputMass) / static_cast(N)) * 100 << " #" << std::endl << "#############################################################################" << std::endl; }