// $Id$ // vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include #include "PhraseTableCreator.h" #include "ConsistentPhrases.h" #include "ThrowingFwrite.h" #include "util/file.hh" #include "util/exception.hh" namespace Moses { bool operator<(const PackedItem &pi1, const PackedItem &pi2) { if(pi1.GetLine() < pi2.GetLine()) return false; return true; } std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__"; std::string PhraseTableCreator::m_separator = "|||"; PhraseTableCreator::PhraseTableCreator(std::string inPath, std::string outPath, std::string tempfilePath, size_t numScoreComponent, size_t sortScoreIndex, Coding coding, size_t orderBits, size_t fingerPrintBits, bool useAlignmentInfo, bool multipleScoreTrees, size_t quantize, size_t maxRank, bool warnMe #ifdef WITH_THREADS , size_t threads #endif ) : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent), m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe), m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), m_useAlignmentInfo(useAlignmentInfo), m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize), m_maxRank(maxRank), #ifdef WITH_THREADS m_threads(threads), m_srcHash(m_orderBits, m_fingerPrintBits, 1), m_rnkHash(10, 24, m_threads), #else m_srcHash(m_orderBits, m_fingerPrintBits), m_rnkHash(m_orderBits, m_fingerPrintBits), #endif m_maxPhraseLength(0), m_lastFlushedLine(-1), m_lastFlushedSourceNum(0), m_lastFlushedSourcePhrase("") { PrintInfo(); AddTargetSymbolId(m_phraseStopSymbol); size_t cur_pass = 1; size_t all_passes = 2; if(m_coding == PREnc) all_passes = 3; m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); for(std::vector::iterator it = m_scoreCounters.begin(); it != m_scoreCounters.end(); it++) *it = new ScoreCounter(); m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); // 0th pass if(m_coding == REnc) { size_t found = inPath.find_last_of("/\\"); std::string path; if(found != std::string::npos) path = inPath.substr(0, found); else path = "."; LoadLexicalTable(path + "/lex.f2e"); } else if(m_coding == PREnc) { std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl; cur_pass++; CreateRankHash(); } // 1st pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl; m_srcHash.BeginSave(m_outFile); if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedTargetPhrases = new StringVectorTemp(allocEncoded); } else { m_encodedTargetPhrases = new StringVectorTemp(); } EncodeTargetPhrases(); cur_pass++; std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; CalcHuffmanCodes(); // 2nd pass std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl; if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedTargetPhrases = new StringVector(allocCompressed); } else { m_compressedTargetPhrases = new StringVector(); } CompressTargetPhrases(); std::cerr << "Saving to " << m_outPath << std::endl; Save(); std::cerr << "Done" << std::endl; std::fclose(m_outFile); } PhraseTableCreator::~PhraseTableCreator() { delete m_symbolTree; if(m_useAlignmentInfo) delete m_alignTree; for(size_t i = 0; i < m_scoreTrees.size(); i++) { delete m_scoreTrees[i]; delete m_scoreCounters[i]; } delete m_encodedTargetPhrases; delete m_compressedTargetPhrases; } void PhraseTableCreator::PrintInfo() { std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"}; std::cerr << "Used options:" << std::endl; std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl; std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl; std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl; std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl; std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl; if(m_coding == PREnc) { std::cerr << "\tMaxiumum allowed rank for PREnc: "; if(!m_maxRank) std::cerr << "unlimited" << std::endl; else std::cerr << m_maxRank << std::endl; } std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl; std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; std::cerr << "\tUsing score quantization: "; if(m_quantize) std::cerr << m_quantize << " best" << std::endl; else std::cerr << "no" << std::endl; std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl; #ifdef WITH_THREADS std::cerr << "\tRunning with " << m_threads << " threads" << std::endl; #endif std::cerr << std::endl; } void PhraseTableCreator::Save() { // Save type of encoding ThrowingFwrite(&m_coding, sizeof(m_coding), 1, m_outFile); ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile); ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile); ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile); ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile); if(m_coding == REnc) { // Save source language symbols for REnc std::vector temp1; temp1.resize(m_sourceSymbolsMap.size()); for(boost::unordered_map::iterator it = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); StringVector sourceSymbols; for(std::vector::iterator it = temp1.begin(); it != temp1.end(); it++) sourceSymbols.push_back(*it); sourceSymbols.save(m_outFile); // Save lexical translation table for REnc size_t size = m_lexicalTableIndex.size(); ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile); ThrowingFwrite(&m_lexicalTableIndex[0], sizeof(size_t), size, m_outFile); size = m_lexicalTable.size(); ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile); ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile); } // Save target language symbols std::vector temp2; temp2.resize(m_targetSymbolsMap.size()); for(boost::unordered_map::iterator it = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) temp2[it->second] = it->first; StringVector targetSymbols; for(std::vector::iterator it = temp2.begin(); it != temp2.end(); it++) targetSymbols.push_back(*it); targetSymbols.save(m_outFile); // Save Huffman codes for target language symbols m_symbolTree->Save(m_outFile); // Save number of Huffman code sets for scores and // save Huffman code sets ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile); size_t numScoreTrees = m_scoreTrees.size(); for(size_t i = 0; i < numScoreTrees; i++) m_scoreTrees[i]->Save(m_outFile); // Save Huffman codes for alignments if(m_useAlignmentInfo) m_alignTree->Save(m_outFile); // Save compressed target phrase collections m_compressedTargetPhrases->save(m_outFile); } void PhraseTableCreator::LoadLexicalTable(std::string filePath) { std::vector t_lexTable; std::cerr << "Reading in lexical table for Rank Encoding" << std::endl; std::ifstream lexIn(filePath.c_str(), std::ifstream::in); std::string src, trg; float prob; // Reading in the translation probability lexicon std::cerr << "\tLoading from " << filePath << std::endl; while(lexIn >> trg >> src >> prob) { t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob)); AddSourceSymbolId(src); AddTargetSymbolId(trg); } // Sorting lexicon by source words by lexicographical order, corresponding // target words by decreasing probability. std::cerr << "\tSorting according to translation rank" << std::endl; std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter()); // Re-assigning source word ids in lexicographical order std::vector temp1; temp1.resize(m_sourceSymbolsMap.size()); for(boost::unordered_map::iterator it = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); for(size_t i = 0; i < temp1.size(); i++) m_sourceSymbolsMap[temp1[i]] = i; // Building the lexicon based on source and target word ids std::string srcWord = ""; size_t srcIdx = 0; for(std::vector::iterator it = t_lexTable.begin(); it != t_lexTable.end(); it++) { // If we encounter a new source word if(it->first.first != srcWord) { srcIdx = GetSourceSymbolId(it->first.first); // Store position of first translation if(srcIdx >= m_lexicalTableIndex.size()) m_lexicalTableIndex.resize(srcIdx + 1); m_lexicalTableIndex[srcIdx] = m_lexicalTable.size(); } // Store pair of source word and target word size_t trgIdx = GetTargetSymbolId(it->first.second); m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx)); srcWord = it->first.first; } std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl; std::cerr << std::endl; } void PhraseTableCreator::CreateRankHash() { InputFileStream inFile(m_inPath); #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { RankingTask* rt = new RankingTask(inFile, *this); threads.create_thread(*rt); } threads.join_all(); #else RankingTask* rt = new RankingTask(inFile, *this); (*rt)(); delete rt; #endif FlushRankedQueue(true); } inline std::string PhraseTableCreator::MakeSourceKey(std::string &source) { return source + " " + m_separator + " "; } inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) { return source + " " + m_separator + " " + target + " " + m_separator + " "; } void PhraseTableCreator::EncodeTargetPhrases() { InputFileStream inFile(m_inPath); #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { EncodingTask* et = new EncodingTask(inFile, *this); threads.create_thread(*et); } threads.join_all(); #else EncodingTask* et = new EncodingTask(inFile, *this); (*et)(); delete et; #endif FlushEncodedQueue(true); } void PhraseTableCreator::CompressTargetPhrases() { #ifdef WITH_THREADS boost::thread_group threads; for (size_t i = 0; i < m_threads; ++i) { CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); threads.create_thread(*ct); } threads.join_all(); #else CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this); (*ct)(); delete ct; #endif FlushCompressedQueue(true); } void PhraseTableCreator::CalcHuffmanCodes() { std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size() << " target phrase symbols" << std::endl; m_symbolTree = new SymbolTree(m_symbolCounter.Begin(), m_symbolCounter.End()); std::vector::iterator treeIt = m_scoreTrees.begin(); for(std::vector::iterator it = m_scoreCounters.begin(); it != m_scoreCounters.end(); it++) { if(m_quantize) (*it)->Quantize(m_quantize); std::cerr << "\tCreating Huffman codes for " << (*it)->Size() << " scores" << std::endl; *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); treeIt++; } if(m_useAlignmentInfo) { std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size() << " alignment points" << std::endl; m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End()); } std::cerr << std::endl; } void PhraseTableCreator::AddSourceSymbolId(std::string& symbol) { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif if(m_sourceSymbolsMap.count(symbol) == 0) { unsigned value = m_sourceSymbolsMap.size(); m_sourceSymbolsMap[symbol] = value; } } void PhraseTableCreator::AddTargetSymbolId(std::string& symbol) { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif if(m_targetSymbolsMap.count(symbol) == 0) { unsigned value = m_targetSymbolsMap.size(); m_targetSymbolsMap[symbol] = value; } } unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol) { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif boost::unordered_map::iterator it = m_sourceSymbolsMap.find(symbol); if(it != m_sourceSymbolsMap.end()) return it->second; else return m_sourceSymbolsMap.size(); } unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol) { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif boost::unordered_map::iterator it = m_targetSymbolsMap.find(symbol); UTIL_THROW_IF2(it == m_targetSymbolsMap.end(), "No id found for target symbol: " << symbol); return it->second; } unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol) { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif boost::unordered_map::iterator it = m_targetSymbolsMap.find(symbol); if(it != m_targetSymbolsMap.end()) return it->second; else { unsigned value = m_targetSymbolsMap.size(); m_targetSymbolsMap[symbol] = value; return value; } } unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx) { size_t srcTrgIdx = m_lexicalTableIndex[srcIdx]; while(srcTrgIdx < m_lexicalTable.size() && srcIdx == m_lexicalTable[srcTrgIdx].first && m_lexicalTable[srcTrgIdx].second != trgIdx) srcTrgIdx++; if(srcTrgIdx < m_lexicalTable.size() && m_lexicalTable[srcTrgIdx].second == trgIdx) return srcTrgIdx - m_lexicalTableIndex[srcIdx]; else return m_lexicalTable.size(); } unsigned PhraseTableCreator::EncodeREncSymbol1(unsigned trgIdx) { assert((~(1 << 31)) > trgIdx); return trgIdx; } unsigned PhraseTableCreator::EncodeREncSymbol2(unsigned pos, unsigned rank) { unsigned symbol = rank; symbol |= 1 << 30; symbol |= pos << 24; return symbol; } unsigned PhraseTableCreator::EncodeREncSymbol3(unsigned rank) { unsigned symbol = rank; symbol |= 2 << 30; return symbol; } unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx) { assert((~(1 << 31)) > trgIdx); return trgIdx; } unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank) { // "left" and "right" must be smaller than 2^5 // "rank" must be smaller than 2^19 left = left + 32; right = right + 32; assert(64 > left); assert(64 > right); assert(524288 > rank); unsigned symbol = 0; symbol |= 1 << 31; symbol |= left << 25; symbol |= right << 19; symbol |= rank; return symbol; } void PhraseTableCreator::EncodeTargetPhraseNone(std::vector& t, std::ostream& os) { std::stringstream encodedTargetPhrase; size_t j = 0; while(j < t.size()) { unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); m_symbolCounter.Increase(targetSymbolId); os.write((char*)&targetSymbolId, sizeof(targetSymbolId)); j++; } unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); os.write((char*)&stopSymbolId, sizeof(stopSymbolId)); m_symbolCounter.Increase(stopSymbolId); } void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector& s, std::vector& t, std::set& a, std::ostream& os) { std::stringstream encodedTargetPhrase; std::vector > a2(t.size()); for(std::set::iterator it = a.begin(); it != a.end(); it++) a2[it->second].push_back(it->first); for(size_t i = 0; i < t.size(); i++) { unsigned idxTarget = GetOrAddTargetSymbolId(t[i]); unsigned encodedSymbol = -1; unsigned bestSrcPos = s.size(); unsigned bestDiff = s.size(); unsigned bestRank = m_lexicalTable.size(); unsigned badRank = m_lexicalTable.size(); for(std::vector::iterator it = a2[i].begin(); it != a2[i].end(); it++) { unsigned idxSource = GetSourceSymbolId(s[*it]); size_t r = GetRank(idxSource, idxTarget); if(r != badRank) { if(r < bestRank) { bestRank = r; bestSrcPos = *it; bestDiff = abs(*it-i); } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) { bestSrcPos = *it; bestDiff = abs(*it-i); } } } if(bestRank != badRank && bestSrcPos < s.size()) { if(bestSrcPos == i) encodedSymbol = EncodeREncSymbol3(bestRank); else encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); a.erase(AlignPoint(bestSrcPos, i)); } else { encodedSymbol = EncodeREncSymbol1(idxTarget); } os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); } unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); } void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector& s, std::vector& t, std::set& a, size_t ownRank, std::ostream& os) { std::vector encodedSymbols(t.size()); std::vector encodedSymbolsLengths(t.size(), 0); ConsistentPhrases cp(s.size(), t.size(), a); while(!cp.Empty()) { ConsistentPhrases::Phrase p = cp.Pop(); std::stringstream key1; key1 << s[p.i]; for(int i = p.i+1; i < p.i+p.m; i++) key1 << " " << s[i]; std::stringstream key2; key2 << t[p.j]; for(int i = p.j+1; i < p.j+p.n; i++) key2 << " " << t[i]; int rank = -1; std::string key1Str = key1.str(), key2Str = key2.str(); size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)]; if(idx != m_rnkHash.GetSize()) rank = m_ranks[idx]; if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) { if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) { std::stringstream encodedSymbol; encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank); encodedSymbolsLengths[p.j] = p.n; std::set tAlignment; for(std::set::iterator it = a.begin(); it != a.end(); it++) if(it->first < p.i || it->first >= p.i + p.m || it->second < p.j || it->second >= p.j + p.n) tAlignment.insert(*it); a = tAlignment; cp.RemoveOverlap(p); } } } std::stringstream encodedTargetPhrase; size_t j = 0; while(j < t.size()) { if(encodedSymbolsLengths[j] > 0) { unsigned encodedSymbol = encodedSymbols[j]; m_symbolCounter.Increase(encodedSymbol); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); j += encodedSymbolsLengths[j]; } else { unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId); m_symbolCounter.Increase(encodedSymbol); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); j++; } } unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); } void PhraseTableCreator::EncodeScores(std::vector& scores, std::ostream& os) { size_t c = 0; float score; while(c < scores.size()) { score = scores[c]; score = FloorScore(TransformScore(score)); os.write((char*)&score, sizeof(score)); m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score); c++; } } void PhraseTableCreator::EncodeAlignment(std::set& alignment, std::ostream& os) { for(std::set::iterator it = alignment.begin(); it != alignment.end(); it++) { os.write((char*)&(*it), sizeof(AlignPoint)); m_alignCounter.Increase(*it); } AlignPoint stop(-1, -1); os.write((char*) &stop, sizeof(AlignPoint)); m_alignCounter.Increase(stop); } std::string PhraseTableCreator::EncodeLine(std::vector& tokens, size_t ownRank) { std::string sourcePhraseStr = tokens[0]; std::string targetPhraseStr = tokens[1]; std::string scoresStr = tokens[2]; std::string alignmentStr = ""; if(tokens.size() > 3) alignmentStr = tokens[3]; std::vector s = Tokenize(sourcePhraseStr); size_t phraseLength = s.size(); if(m_maxPhraseLength < phraseLength) m_maxPhraseLength = phraseLength; std::vector t = Tokenize(targetPhraseStr); std::vector scores = Tokenize(scoresStr); if(scores.size() != m_numScoreComponent) { std::stringstream strme; strme << "Error: Wrong number of scores detected (" << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." << std::endl; UTIL_THROW2(strme.str()); } std::set a; if(m_coding != None || m_useAlignmentInfo) { std::vector positions = Tokenize(alignmentStr, " \t-"); for(size_t i = 0; i < positions.size(); i += 2) { a.insert(AlignPoint(positions[i], positions[i+1])); } } std::stringstream encodedTargetPhrase; if(m_coding == PREnc) { EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase); } else if(m_coding == REnc) { EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); } else { EncodeTargetPhraseNone(t, encodedTargetPhrase); } EncodeScores(scores, encodedTargetPhrase); if(m_useAlignmentInfo) EncodeAlignment(a, encodedTargetPhrase); return encodedTargetPhrase.str(); } std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection) { enum EncodeState { ReadSymbol, ReadScore, ReadAlignment, EncodeSymbol, EncodeScore, EncodeAlignment }; EncodeState state = ReadSymbol; unsigned phraseStopSymbolId; if(m_coding == REnc) phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); else if(m_coding == PREnc) phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); else phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); AlignPoint alignStopSymbol(-1, -1); std::stringstream encodedStream(encodedCollection); encodedStream.unsetf(std::ios::skipws); std::string compressedEncodedCollection; BitWrapper<> bitStream(compressedEncodedCollection); unsigned symbol; float score; size_t currScore = 0; AlignPoint alignPoint; while(encodedStream) { switch(state) { case ReadSymbol: encodedStream.read((char*) &symbol, sizeof(unsigned)); state = EncodeSymbol; break; case ReadScore: if(currScore == m_numScoreComponent) { currScore = 0; if(m_useAlignmentInfo) state = ReadAlignment; else state = ReadSymbol; } else { encodedStream.read((char*) &score, sizeof(float)); currScore++; state = EncodeScore; } break; case ReadAlignment: encodedStream.read((char*) &alignPoint, sizeof(AlignPoint)); state = EncodeAlignment; break; case EncodeSymbol: state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol; m_symbolTree->Put(bitStream, symbol); break; case EncodeScore: { state = ReadScore; size_t idx = m_multipleScoreTrees ? currScore-1 : 0; if(m_quantize) score = m_scoreCounters[idx]->LowerBound(score); m_scoreTrees[idx]->Put(bitStream, score); } break; case EncodeAlignment: state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment; m_alignTree->Put(bitStream, alignPoint); break; } } return compressedEncodedCollection; } void PhraseTableCreator::AddRankedLine(PackedItem& pi) { m_queue.push(pi); } void PhraseTableCreator::FlushRankedQueue(bool force) { size_t step = 1ul << 10; while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { m_lastFlushedLine++; PackedItem pi = m_queue.top(); m_queue.pop(); if(m_lastSourceRange.size() == step) { m_rnkHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); } if(m_lastFlushedSourcePhrase != pi.GetSrc()) { if(m_rankQueue.size()) { m_lastFlushedSourceNum++; if(m_lastFlushedSourceNum % 100000 == 0) { std::cerr << "."; } if(m_lastFlushedSourceNum % 5000000 == 0) { std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; } m_ranks.resize(m_lastFlushedLine + 1); int r = 0; while(!m_rankQueue.empty()) { m_ranks[m_rankQueue.top().second] = r++; m_rankQueue.pop(); } } } m_lastSourceRange.push_back(pi.GetTrg()); m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine())); m_lastFlushedSourcePhrase = pi.GetSrc(); } if(force) { if(!m_lastSourceRange.empty()) { m_rnkHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); } #ifdef WITH_THREADS m_rnkHash.WaitAll(); #endif m_ranks.resize(m_lastFlushedLine + 1); int r = 0; while(!m_rankQueue.empty()) { m_ranks[m_rankQueue.top().second] = r++; m_rankQueue.pop(); } m_lastFlushedLine = -1; m_lastFlushedSourceNum = 0; std::cerr << std::endl << std::endl; } } void PhraseTableCreator::AddEncodedLine(PackedItem& pi) { m_queue.push(pi); } void PhraseTableCreator::FlushEncodedQueue(bool force) { while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; if(m_lastFlushedSourcePhrase != pi.GetSrc()) { if(m_lastCollection.size()) { std::stringstream targetPhraseCollection; for(std::vector::iterator it = m_lastCollection.begin(); it != m_lastCollection.end(); it++) targetPhraseCollection << *it; m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); m_lastFlushedSourceNum++; if(m_lastFlushedSourceNum % 100000 == 0) std::cerr << "."; if(m_lastFlushedSourceNum % 5000000 == 0) std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; m_lastCollection.clear(); } } if(m_lastSourceRange.size() == (1ul << m_orderBits)) { m_srcHash.AddRange(m_lastSourceRange); m_srcHash.SaveLastRange(); m_srcHash.DropLastRange(); m_lastSourceRange.clear(); } m_lastFlushedSourcePhrase = pi.GetSrc(); if(m_coding == PREnc) { if(m_lastCollection.size() <= pi.GetRank()) m_lastCollection.resize(pi.GetRank() + 1); m_lastCollection[pi.GetRank()] = pi.GetTrg(); } else { m_lastCollection.push_back(pi.GetTrg()); } } if(force) { if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase) m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); if(m_lastCollection.size()) { std::stringstream targetPhraseCollection; for(std::vector::iterator it = m_lastCollection.begin(); it != m_lastCollection.end(); it++) targetPhraseCollection << *it; m_encodedTargetPhrases->push_back(targetPhraseCollection.str()); m_lastCollection.clear(); } if(!m_lastSourceRange.empty()) { m_srcHash.AddRange(m_lastSourceRange); m_lastSourceRange.clear(); } #ifdef WITH_THREADS m_srcHash.WaitAll(); #endif m_srcHash.SaveLastRange(); m_srcHash.DropLastRange(); m_srcHash.FinalizeSave(); m_lastFlushedLine = -1; m_lastFlushedSourceNum = 0; std::cerr << std::endl << std::endl; } } void PhraseTableCreator::AddCompressedCollection(PackedItem& pi) { m_queue.push(pi); } void PhraseTableCreator::FlushCompressedQueue(bool force) { if(force || m_queue.size() > 10000) { while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) { PackedItem pi = m_queue.top(); m_queue.pop(); m_lastFlushedLine++; m_compressedTargetPhrases->push_back(pi.GetTrg()); if((pi.GetLine()+1) % 100000 == 0) std::cerr << "."; if((pi.GetLine()+1) % 5000000 == 0) std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; } } if(force) { m_lastFlushedLine = -1; std::cerr << std::endl << std::endl; } } //****************************************************************************// size_t RankingTask::m_lineNum = 0; #ifdef WITH_THREADS boost::mutex RankingTask::m_mutex; boost::mutex RankingTask::m_fileMutex; #endif RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator) : m_inFile(inFile), m_creator(creator) {} void RankingTask::operator()() { size_t lineNum = 0; std::vector lines; size_t max_lines = 1000; lines.reserve(max_lines); { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif std::string line; while(lines.size() < max_lines && std::getline(m_inFile, line)) lines.push_back(line); lineNum = m_lineNum; m_lineNum += lines.size(); } std::vector result; result.reserve(max_lines); while(lines.size()) { for(size_t i = 0; i < lines.size(); i++) { std::vector tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); for(std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) *it = Moses::Trim(*it); if(tokens.size() < 4) { std::stringstream strme; strme << "Error: It seems the following line has a wrong format:" << std::endl; strme << "Line " << i << ": " << lines[i] << std::endl; UTIL_THROW2(strme.str()); } if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { std::stringstream strme; strme << "Error: It seems the following line contains no alignment information, " << std::endl; strme << "but you are using "; strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); strme << " encoding which makes use of alignment data. " << std::endl; strme << "Use -encoding None" << std::endl; strme << "Line " << i << ": " << lines[i] << std::endl; UTIL_THROW2(strme.str()); } std::vector scores = Tokenize(tokens[2]); if(scores.size() != m_creator.m_numScoreComponent) { std::stringstream strme; strme << "Error: It seems the following line has a wrong number of scores (" << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl; strme << "Line " << i << ": " << lines[i] << std::endl; UTIL_THROW2(strme.str()); } float sortScore = scores[m_creator.m_sortScoreIndex]; std::string key1 = m_creator.MakeSourceKey(tokens[0]); std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]); PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore); result.push_back(packedItem); } lines.clear(); { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif for(size_t i = 0; i < result.size(); i++) m_creator.AddRankedLine(result[i]); m_creator.FlushRankedQueue(); } result.clear(); lines.reserve(max_lines); result.reserve(max_lines); #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif std::string line; while(lines.size() < max_lines && std::getline(m_inFile, line)) lines.push_back(line); lineNum = m_lineNum; m_lineNum += lines.size(); } } size_t EncodingTask::m_lineNum = 0; #ifdef WITH_THREADS boost::mutex EncodingTask::m_mutex; boost::mutex EncodingTask::m_fileMutex; #endif EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator) : m_inFile(inFile), m_creator(creator) {} void EncodingTask::operator()() { size_t lineNum = 0; std::vector lines; size_t max_lines = 1000; lines.reserve(max_lines); { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif std::string line; while(lines.size() < max_lines && std::getline(m_inFile, line)) lines.push_back(line); lineNum = m_lineNum; m_lineNum += lines.size(); } std::vector result; result.reserve(max_lines); while(lines.size()) { for(size_t i = 0; i < lines.size(); i++) { std::vector tokens; Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); for(std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) *it = Moses::Trim(*it); if(tokens.size() < 3) { std::stringstream strme; strme << "Error: It seems the following line has a wrong format:" << std::endl; strme << "Line " << i << ": " << lines[i] << std::endl; UTIL_THROW2(strme.str()); } if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) { std::stringstream strme; strme << "Error: It seems the following line contains no alignment information, " << std::endl; strme << "but you are using "; strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc"); strme << " encoding which makes use of alignment data. " << std::endl; strme << "Use -encoding None" << std::endl; strme << "Line " << i << ": " << lines[i] << std::endl; UTIL_THROW2(strme.str()); } size_t ownRank = 0; if(m_creator.m_coding == PhraseTableCreator::PREnc) ownRank = m_creator.m_ranks[lineNum + i]; std::string encodedLine = m_creator.EncodeLine(tokens, ownRank); PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank); result.push_back(packedItem); } lines.clear(); { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif for(size_t i = 0; i < result.size(); i++) m_creator.AddEncodedLine(result[i]); m_creator.FlushEncodedQueue(); } result.clear(); lines.reserve(max_lines); result.reserve(max_lines); #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_fileMutex); #endif std::string line; while(lines.size() < max_lines && std::getline(m_inFile, line)) lines.push_back(line); lineNum = m_lineNum; m_lineNum += lines.size(); } } //****************************************************************************// size_t CompressionTask::m_collectionNum = 0; #ifdef WITH_THREADS boost::mutex CompressionTask::m_mutex; #endif CompressionTask::CompressionTask(StringVectorTemp& encodedCollections, PhraseTableCreator& creator) : m_encodedCollections(encodedCollections), m_creator(creator) {} void CompressionTask::operator()() { size_t collectionNum; { #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif collectionNum = m_collectionNum; m_collectionNum++; } while(collectionNum < m_encodedCollections.size()) { std::string collection = m_encodedCollections[collectionNum]; std::string compressedCollection = m_creator.CompressEncodedCollection(collection); std::string dummy; PackedItem packedItem(collectionNum, dummy, compressedCollection, 0); #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif m_creator.AddCompressedCollection(packedItem); m_creator.FlushCompressedQueue(); collectionNum = m_collectionNum; m_collectionNum++; } } //****************************************************************************// PackedItem::PackedItem(long line, std::string sourcePhrase, std::string packedTargetPhrase, size_t rank, float score) : m_line(line), m_sourcePhrase(sourcePhrase), m_packedTargetPhrase(packedTargetPhrase), m_rank(rank), m_score(score) {} long PackedItem::GetLine() const { return m_line; } const std::string& PackedItem::GetSrc() const { return m_sourcePhrase; } const std::string& PackedItem::GetTrg() const { return m_packedTargetPhrase; } size_t PackedItem::GetRank() const { return m_rank; } float PackedItem::GetScore() const { return m_score; } }