From 004cc0d3b1f66839c49603c8651da7c097336c8d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 2 Aug 2012 18:32:55 +0200 Subject: [PATCH] Adding source file for compact phrase table --- moses/src/CompactPT/BlockHashIndex.cpp | 307 +++++ moses/src/CompactPT/BlockHashIndex.h | 197 +++ moses/src/CompactPT/CanonicalHuffman.h | 322 +++++ .../src/CompactPT/CmphStringVectorAdapter.cpp | 69 + moses/src/CompactPT/CmphStringVectorAdapter.h | 81 ++ moses/src/CompactPT/ConsistantPhrases.h | 114 ++ .../LexicalReorderingTableCompact.cpp | 135 ++ .../CompactPT/LexicalReorderingTableCompact.h | 56 + .../LexicalReorderingTableCreator.cpp | 388 ++++++ .../CompactPT/LexicalReorderingTableCreator.h | 117 ++ moses/src/CompactPT/ListCoders.h | 291 ++++ moses/src/CompactPT/MmapAllocator.h | 175 +++ moses/src/CompactPT/MonotonicVector.h | 227 ++++ moses/src/CompactPT/MurmurHash3.cpp | 335 +++++ moses/src/CompactPT/MurmurHash3.h | 37 + moses/src/CompactPT/PackedArray.h | 177 +++ moses/src/CompactPT/PhraseDecoder.cpp | 469 +++++++ moses/src/CompactPT/PhraseDecoder.h | 133 ++ .../src/CompactPT/PhraseDictionaryCompact.cpp | 188 +++ moses/src/CompactPT/PhraseDictionaryCompact.h | 119 ++ moses/src/CompactPT/PhraseTableCreator.cpp | 1195 +++++++++++++++++ moses/src/CompactPT/PhraseTableCreator.h | 401 ++++++ moses/src/CompactPT/StringVector.h | 600 +++++++++ .../CompactPT/TargetPhraseCollectionCache.h | 161 +++ 24 files changed, 6294 insertions(+) create mode 100644 moses/src/CompactPT/BlockHashIndex.cpp create mode 100644 moses/src/CompactPT/BlockHashIndex.h create mode 100644 moses/src/CompactPT/CanonicalHuffman.h create mode 100644 moses/src/CompactPT/CmphStringVectorAdapter.cpp create mode 100644 moses/src/CompactPT/CmphStringVectorAdapter.h create mode 100644 moses/src/CompactPT/ConsistantPhrases.h create mode 100644 moses/src/CompactPT/LexicalReorderingTableCompact.cpp create mode 100644 moses/src/CompactPT/LexicalReorderingTableCompact.h create mode 100644 
moses/src/CompactPT/LexicalReorderingTableCreator.cpp create mode 100644 moses/src/CompactPT/LexicalReorderingTableCreator.h create mode 100644 moses/src/CompactPT/ListCoders.h create mode 100644 moses/src/CompactPT/MmapAllocator.h create mode 100644 moses/src/CompactPT/MonotonicVector.h create mode 100644 moses/src/CompactPT/MurmurHash3.cpp create mode 100644 moses/src/CompactPT/MurmurHash3.h create mode 100644 moses/src/CompactPT/PackedArray.h create mode 100644 moses/src/CompactPT/PhraseDecoder.cpp create mode 100644 moses/src/CompactPT/PhraseDecoder.h create mode 100644 moses/src/CompactPT/PhraseDictionaryCompact.cpp create mode 100644 moses/src/CompactPT/PhraseDictionaryCompact.h create mode 100644 moses/src/CompactPT/PhraseTableCreator.cpp create mode 100644 moses/src/CompactPT/PhraseTableCreator.h create mode 100644 moses/src/CompactPT/StringVector.h create mode 100644 moses/src/CompactPT/TargetPhraseCollectionCache.h diff --git a/moses/src/CompactPT/BlockHashIndex.cpp b/moses/src/CompactPT/BlockHashIndex.cpp new file mode 100644 index 000000000..5b1e8272e --- /dev/null +++ b/moses/src/CompactPT/BlockHashIndex.cpp @@ -0,0 +1,307 @@ +#include "BlockHashIndex.h" + +namespace Moses +{ +#ifdef WITH_THREADS +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum) +: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_algo(CMPH_CHD), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), + m_threadPool(threadsNum) {} + +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + CMPH_ALGO algo, size_t threadsNum) +: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_algo(algo), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), + m_threadPool(threadsNum) {} +#else +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits) +: m_orderBits(orderBits), 
m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_algo(CMPH_CHD), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) {} + +BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, CMPH_ALGO algo) +: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_fileHandle(0), m_fileHandleStart(0), m_algo(algo), m_size(0), + m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) {} +#endif + +BlockHashIndex::~BlockHashIndex() +{ + for(std::vector::iterator it = m_hashes.begin(); + it != m_hashes.end(); it++) + if(*it != 0) + cmph_destroy(*it); + + for(std::vector*>::iterator it = m_arrays.begin(); + it != m_arrays.end(); it++) + if(*it != 0) + delete *it; +} + +size_t BlockHashIndex::GetHash(const char* key) +{ + std::string keyStr(key); + size_t i = std::distance(m_landmarks.begin(), + std::upper_bound(m_landmarks.begin(), + m_landmarks.end(), keyStr)) - 1; + + if(i == 0ul-1) + return GetSize(); + + size_t pos = GetHash(i, key); + if(pos != GetSize()) + return (1ul << m_orderBits) * i + pos; + else + return GetSize(); +} + +size_t BlockHashIndex::GetFprint(const char* key) const +{ + size_t hash; + MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash); + hash &= (1ul << m_fingerPrintBits) - 1; + return hash; +} + +size_t BlockHashIndex::GetHash(size_t i, const char* key) +{ + if(m_hashes[i] == 0) + LoadRange(i); + + size_t idx = cmph_search(m_hashes[i], key, (cmph_uint32) strlen(key)); + + std::pair orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits); + m_clocks[i] = clock(); + + if(GetFprint(key) == orderPrint.second) + return orderPrint.first; + else + return GetSize(); +} + +size_t BlockHashIndex::GetHash(std::string key) +{ + return GetHash(key.c_str()); +} + +size_t BlockHashIndex::operator[](std::string key) +{ + return GetHash(key); +} + +size_t BlockHashIndex::operator[](char* key) +{ + return GetHash(key); +} + +size_t BlockHashIndex::Save(std::string filename) +{ + 
std::FILE* mphf = std::fopen(filename.c_str(), "w"); + size_t size = Save(mphf); + std::fclose(mphf); + return size; +} + +void BlockHashIndex::BeginSave(std::FILE * mphf) +{ + m_fileHandle = mphf; + std::fwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle); + std::fwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle); + + m_fileHandleStart = std::ftell(m_fileHandle); + + size_t relIndexPos = 0; + std::fwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); +} + +void BlockHashIndex::SaveRange(size_t i) +{ + if(m_seekIndex.size() <= i) + m_seekIndex.resize(i+1); + m_seekIndex[i] = std::ftell(m_fileHandle) - m_fileHandleStart; + cmph_dump(m_hashes[i], m_fileHandle); + m_arrays[i]->Save(m_fileHandle); +} + +void BlockHashIndex::SaveLastRange() +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) + { + size_t current = -m_queue.top(); + m_queue.pop(); + SaveRange(current); + m_lastSaved = current; + } +} + +void BlockHashIndex::DropRange(size_t i) +{ + if(m_hashes[i] != 0) + { + cmph_destroy(m_hashes[i]); + m_hashes[i] = 0; + } + if(m_arrays[i] != 0) + { + delete m_arrays[i]; + m_arrays[i] = 0; + m_clocks[i] = 0; + } + m_numLoadedRanges--; +} + +void BlockHashIndex::DropLastRange() +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + while(m_lastDropped != m_lastSaved) + DropRange(++m_lastDropped); +} + +#ifdef WITH_THREADS +void BlockHashIndex::WaitAll() +{ + m_threadPool.Stop(true); +} +#endif + +size_t BlockHashIndex::FinalizeSave() +{ +#ifdef WITH_THREADS + m_threadPool.Stop(true); +#endif + + SaveLastRange(); + + size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart; + + std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET); + std::fwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle); + + std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); + m_landmarks.save(m_fileHandle); + + size_t seekIndexSize = 
m_seekIndex.size(); + std::fwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle); + std::fwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle); + + std::fwrite(&m_size, sizeof(size_t), 1, m_fileHandle); + + size_t fileHandleStop = std::ftell(m_fileHandle); + return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits) + + sizeof(m_fingerPrintBits); +} + +size_t BlockHashIndex::Save(std::FILE * mphf) +{ + m_queue = std::priority_queue(); + BeginSave(mphf); + for(size_t i = 0; i < m_hashes.size(); i++) + SaveRange(i); + return FinalizeSave(); +} + +size_t BlockHashIndex::LoadIndex(std::FILE* mphf) +{ + m_fileHandle = mphf; + + size_t beginning = std::ftell(mphf); + + std::fread(&m_orderBits, sizeof(size_t), 1, mphf); + std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf); + m_fileHandleStart = std::ftell(m_fileHandle); + + size_t relIndexPos; + std::fread(&relIndexPos, sizeof(size_t), 1, mphf); + std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET); + + m_landmarks.load(mphf); + + size_t seekIndexSize; + std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle); + m_seekIndex.resize(seekIndexSize); + std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle); + m_hashes.resize(seekIndexSize, 0); + m_clocks.resize(seekIndexSize, 0); + m_arrays.resize(seekIndexSize, 0); + + std::fread(&m_size, sizeof(size_t), 1, m_fileHandle); + + size_t end = std::ftell(mphf); + + return end - beginning; +} + +void BlockHashIndex::LoadRange(size_t i) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET); + cmph_t* hash = cmph_load(m_fileHandle); + m_arrays[i] = new PairedPackedArray<>(0, m_orderBits, + m_fingerPrintBits); + m_arrays[i]->Load(m_fileHandle); + + m_hashes[i] = hash; + m_clocks[i] = clock(); + + m_numLoadedRanges++; +} + +size_t BlockHashIndex::Load(std::string filename) +{ + std::FILE* mphf = std::fopen(filename.c_str(), 
"r"); + size_t size = Load(mphf); + std::fclose(mphf); + return size; +} + +size_t BlockHashIndex::Load(std::FILE * mphf) +{ + size_t byteSize = LoadIndex(mphf); + size_t end = std::ftell(mphf); + + for(size_t i = 0; i < m_seekIndex.size(); i++) + LoadRange(i); + std::fseek(m_fileHandle, end, SEEK_SET); + return byteSize; +} + +size_t BlockHashIndex::GetSize() const +{ + return m_size; +} + +void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + size_t n = m_hashes.size() * ratio; + if(m_numLoadedRanges > size_t(n * (1 + tolerance))) + { + typedef std::vector > LastLoaded; + LastLoaded lastLoaded; + for(size_t i = 0; i < m_hashes.size(); i++) + if(m_hashes[i] != 0) + lastLoaded.push_back(std::make_pair(m_clocks[i], i)); + + std::sort(lastLoaded.begin(), lastLoaded.end()); + for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance)); + it != lastLoaded.rend(); it++) + DropRange(it->second); + } +} + +} diff --git a/moses/src/CompactPT/BlockHashIndex.h b/moses/src/CompactPT/BlockHashIndex.h new file mode 100644 index 000000000..1dd3cf6fd --- /dev/null +++ b/moses/src/CompactPT/BlockHashIndex.h @@ -0,0 +1,197 @@ +#ifndef moses_BlockHashIndex_h +#define moses_BlockHashIndex_h + +#include +#include +#include +#include +#include +#include + +#include "cmph/src/cmph.h" +#include "MurmurHash3.h" +#include "StringVector.h" +#include "CmphStringVectorAdapter.h" +#include "PackedArray.h" + +#ifdef WITH_THREADS +#include "ThreadPool.h" +#endif + +namespace Moses +{ + +class BlockHashIndex +{ + private: + std::priority_queue m_queue; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + std::FILE* m_fileHandle; + size_t m_fileHandleStart; + + CMPH_ALGO m_algo; + + StringVector m_landmarks; + + std::vector m_hashes; + std::vector m_clocks; + std::vector*> m_arrays; + + std::vector m_seekIndex; + + size_t m_size; + int m_lastSaved; + int m_lastDropped; + 
size_t m_numLoadedRanges; + +#ifdef WITH_THREADS + ThreadPool m_threadPool; + boost::mutex m_mutex; + + template + class HashTask : public Task + { + public: + HashTask(int id, BlockHashIndex& hash, Keys& keys) + : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {} + + virtual void Run() + { + m_hash.CalcHash(m_id, *m_keys); + } + + virtual ~HashTask() + { + delete m_keys; + } + + private: + int m_id; + BlockHashIndex& m_hash; + Keys* m_keys; + }; +#endif + + size_t GetFprint(const char* key) const; + size_t GetHash(size_t i, const char* key); + + public: +#ifdef WITH_THREADS + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, + size_t threadsNum = 2); + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, CMPH_ALGO algo, + size_t threadsNum = 2); +#else + BlockHashIndex(size_t orderBits, size_t fingerPrintBits); + BlockHashIndex(size_t orderBits, size_t fingerPrintBits, CMPH_ALGO algo); +#endif + + ~BlockHashIndex(); + + size_t GetHash(const char* key); + size_t GetHash(std::string key); + + size_t operator[](std::string key); + size_t operator[](char* key); + + void BeginSave(std::FILE* mphf); + void SaveRange(size_t i); + void SaveLastRange(); + size_t FinalizeSave(); + +#ifdef WITH_THREADS + void WaitAll(); +#endif + + void DropRange(size_t i); + void DropLastRange(); + + size_t LoadIndex(std::FILE* mphf); + void LoadRange(size_t i); + + size_t Save(std::string filename); + size_t Save(std::FILE * mphf); + + size_t Load(std::string filename); + size_t Load(std::FILE * mphf); + + size_t GetSize() const; + + void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); + + template + void AddRange(Keys &keys) + { + size_t current = m_landmarks.size(); + m_landmarks.push_back(keys[0]); + m_size += keys.size(); + +#ifdef WITH_THREADS + HashTask* ht = new HashTask(current, *this, keys); + m_threadPool.Submit(ht); +#else + CalcHash(current, keys); +#endif + } + + template + void CalcHash(size_t current, Keys &keys) + { + cmph_io_adapter_t *source = 
VectorAdapter(keys); + + cmph_config_t *config = cmph_config_new(source); + cmph_config_set_algo(config, m_algo); + + cmph_t* hash = cmph_new(config); + cmph_config_destroy(config); + + PairedPackedArray<> *pv = + new PairedPackedArray<>(keys.size(), m_orderBits, m_fingerPrintBits); + + size_t i = 0; + for(typename Keys::iterator it = keys.begin(); it != keys.end(); it++) + { + std::string temp = *it; + size_t fprint = GetFprint(temp.c_str()); + size_t idx = cmph_search(hash, temp.c_str(), + (cmph_uint32) temp.size()); + + pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); + i++; + } + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + + if(m_hashes.size() <= current) + { + m_hashes.resize(current + 1, 0); + m_arrays.resize(current + 1, 0); + m_clocks.resize(current + 1, 0); + } + + m_hashes[current] = hash; + m_arrays[current] = pv; + m_clocks[current] = clock(); + m_queue.push(-current); + } + + cmph_io_adapter_t* VectorAdapter(std::vector& v) + { + return CmphVectorAdapter(v); + } + + template class Allocator> + cmph_io_adapter_t* VectorAdapter(StringVector& sv) + { + return CmphStringVectorAdapter(sv); + } + +}; + +} +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/CanonicalHuffman.h b/moses/src/CompactPT/CanonicalHuffman.h new file mode 100644 index 000000000..8eb92898b --- /dev/null +++ b/moses/src/CompactPT/CanonicalHuffman.h @@ -0,0 +1,322 @@ +#ifndef moses_CanonicalHuffman_h +#define moses_CanonicalHuffman_h + +#include +#include +#include +#include + +namespace Moses { + +template class Hufftree; + +template +class CanonicalHuffman +{ + private: + std::vector m_symbols; + + std::vector m_firstCodes; + std::vector m_lengthIndex; + + typedef boost::unordered_map > EncodeMap; + EncodeMap m_encodeMap; + + struct MinHeapSorter { + std::vector& m_vec; + + MinHeapSorter(std::vector& vec) : m_vec(vec) { } + + bool operator()(size_t a, size_t b) + { + return m_vec[a] > m_vec[b]; + } + }; + + template + void 
CalcLengths(Iterator begin, Iterator end, std::vector& lengths) + { + size_t n = std::distance(begin, end); + std::vector A(2 * n, 0); + + m_symbols.resize(n); + size_t i = 0; + for(Iterator it = begin; it != end; it++) + { + m_symbols[i] = it->first; + + A[i] = n + i; + A[n + i] = it->second; + i++; + } + + if(n == 1) + { + lengths.push_back(1); + return; + } + + MinHeapSorter hs(A); + std::make_heap(A.begin(), A.begin() + n, hs); + + size_t h = n; + size_t m1, m2; + while(h > 1) + { + m1 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + h--; + + m2 = A[0]; + std::pop_heap(A.begin(), A.begin() + h, hs); + + A[h] = A[m1] + A[m2]; + A[h-1] = h; + A[m1] = A[m2] = h; + + std::push_heap(A.begin(), A.begin() + h, hs); + } + + A[1] = 0; + for(size_t i = 2; i < 2*n; i++) + A[i] = A[A[i]] + 1; + + lengths.resize(n); + for(size_t i = 0; i < n; i++) + lengths[i] = A[i + n]; + } + + + void CalcCodes(std::vector& lengths) + { + std::vector numLength; + for(std::vector::iterator it = lengths.begin(); + it != lengths.end(); it++) { + size_t length = *it; + if(numLength.size() <= length) + numLength.resize(length + 1, 0); + numLength[length]++; + } + + m_lengthIndex.resize(numLength.size()); + m_lengthIndex[0] = 0; + for(size_t l = 1; l < numLength.size(); l++) + m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; + + size_t maxLength = numLength.size() - 1; + + m_firstCodes.resize(maxLength + 1, 0); + for(size_t l = maxLength - 1; l > 0; l--) + m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2; + + std::vector t_symbols; + t_symbols.resize(lengths.size()); + + std::vector nextCode = m_firstCodes; + for(size_t i = 0; i < lengths.size(); i++) + { + Data data = m_symbols[i]; + size_t length = lengths[i]; + + size_t pos = m_lengthIndex[length] + + (nextCode[length] - m_firstCodes[length]); + t_symbols[pos] = data; + + nextCode[length] = nextCode[length] + 1; + } + + m_symbols.swap(t_symbols); + } + + public: + + CanonicalHuffman(std::FILE* pFile, bool 
forEncoding = false) + { + Load(pFile); + + if(forEncoding) + CreateCodeMap(); + } + + template + CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) + { + std::vector lengths; + CalcLengths(begin, end, lengths); + CalcCodes(lengths); + + if(forEncoding) + CreateCodeMap(); + } + + void CreateCodeMap() + { + for(size_t l = 1; l < m_lengthIndex.size(); l++) + { + Code code = m_firstCodes[l]; + size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1] + : m_symbols.size()) - m_lengthIndex[l]; + + for(size_t i = 0; i < num; i++) + { + Data data = m_symbols[m_lengthIndex[l] + i]; + boost::dynamic_bitset<> bitCode(l, code); + m_encodeMap[data] = bitCode; + code++; + } + } + } + + boost::dynamic_bitset<>& Encode(Data data) + { + return m_encodeMap[data]; + } + + template + Data NextSymbol(BitStream& bitStream) + { + if(bitStream.RemainingBits()) + { + Code code = bitStream.GetNext(); + size_t length = 1; + while(code < m_firstCodes[length]) + { + code = 2 * code + bitStream.GetNext(); + length++; + } + + size_t symbolIndex = m_lengthIndex[length] + + (code - m_firstCodes[length]); + return m_symbols[symbolIndex]; + } + return Data(); + } + + size_t Load(std::FILE* pFile) + { + size_t start = std::ftell(pFile); + + size_t size; + std::fread(&size, sizeof(size_t), 1, pFile); + m_symbols.resize(size); + std::fread(&m_symbols[0], sizeof(Data), size, pFile); + + std::fread(&size, sizeof(size_t), 1, pFile); + m_firstCodes.resize(size); + std::fread(&m_firstCodes[0], sizeof(Code), size, pFile); + + std::fread(&size, sizeof(size_t), 1, pFile); + m_lengthIndex.resize(size); + std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile); + + return std::ftell(pFile) - start; + } + + size_t Save(std::FILE* pFile) + { + size_t start = std::ftell(pFile); + + size_t size = m_symbols.size(); + std::fwrite(&size, sizeof(size_t), 1, pFile); + std::fwrite(&m_symbols[0], sizeof(Data), size, pFile); + + size = m_firstCodes.size(); + std::fwrite(&size, sizeof(size_t), 
1, pFile); + std::fwrite(&m_firstCodes[0], sizeof(Code), size, pFile); + + size = m_lengthIndex.size(); + std::fwrite(&size, sizeof(size_t), 1, pFile); + std::fwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile); + + return std::ftell(pFile) - start; + } + +}; + +template +class BitStream +{ + private: + Container& m_data; + + typename Container::iterator m_iterator; + typename Container::value_type m_currentValue; + + size_t m_valueBits; + typename Container::value_type m_mask; + size_t m_bitPos; + + public: + + BitStream(Container &data) + : m_data(data), m_iterator(m_data.begin()), + m_valueBits(sizeof(typename Container::value_type) * 8), + m_mask(1), m_bitPos(0) { } + + size_t RemainingBits() + { + if(m_data.size() * m_valueBits < m_bitPos) + return 0; + return m_data.size() * m_valueBits - m_bitPos; + } + + void SetLeft(size_t bitPos) + { + m_bitPos = m_data.size() * m_valueBits - bitPos; + m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits); + m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits); + m_iterator++; + } + + bool GetNext() + { + if(m_bitPos % m_valueBits == 0) + { + if(m_iterator != m_data.end()) + { + m_currentValue = *m_iterator++; + } + } + else + { + m_currentValue = m_currentValue >> 1; + } + + m_bitPos++; + return (m_currentValue & m_mask); + } + + void PutCode(boost::dynamic_bitset<> code) + { + + for(int j = code.size()-1; j >= 0; j--) + { + if(m_bitPos % m_valueBits == 0) + { + m_data.push_back(0); + } + + if(code[j]) + m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits); + + m_bitPos++; + } + + } + + void Reset() + { + m_iterator = m_data.begin(); + m_bitPos = 0; + } + + Container& GetContainer() + { + return m_data; + } +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/CmphStringVectorAdapter.cpp b/moses/src/CompactPT/CmphStringVectorAdapter.cpp new file mode 100644 index 000000000..60bb37f04 --- /dev/null +++ b/moses/src/CompactPT/CmphStringVectorAdapter.cpp @@ -0,0 +1,69 @@ 
+#include "CmphStringVectorAdapter.h" + +namespace Moses +{ + + void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) + { + delete[] key; + } + + void CmphStringVectorAdapterRewind(void *data) + { + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; + } + + //************************************************************************// + + cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v) + { + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *)&v; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = v.size(); + + return key_source; + } + + int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) + { + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + std::vector* v = (std::vector*)cmph_vector->vector; + size_t size; + *keylen = (*v)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*v)[cmph_vector->position]; + strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); + } + + void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) + { + delete[] key; + } + + void CmphVectorAdapterRewind(void *data) + { + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; + } + + cmph_io_adapter_t* CmphVectorAdapter(std::vector& v) + { + cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v); + + key_source->read = CmphVectorAdapterRead; + key_source->dispose = CmphVectorAdapterDispose; + key_source->rewind = CmphVectorAdapterRewind; + return key_source; + } + +} diff --git a/moses/src/CompactPT/CmphStringVectorAdapter.h b/moses/src/CompactPT/CmphStringVectorAdapter.h new file mode 100644 index 000000000..4d3f608da --- /dev/null 
+++ b/moses/src/CompactPT/CmphStringVectorAdapter.h @@ -0,0 +1,81 @@ +#ifndef moses_CmphStringVectorAdapterNew_h +#define moses_CmphStringVectorAdapterNew_h + +#include +#include + +#include "cmph/src/cmph.h" +#include "StringVector.h" + +namespace Moses +{ + typedef struct + { + void *vector; + cmph_uint32 position; + } + cmph_vector_t; + + + template class Allocator> + cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector& sv) + { + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *)&sv; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = sv.size(); + + return key_source; + } + + template class Allocator> + int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) + { + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + StringVector* sv = (StringVector*)cmph_vector->vector; + size_t size; + *keylen = (*sv)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*sv)[cmph_vector->position]; + std::strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); + } + + void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + + void CmphStringVectorAdapterRewind(void *data); + + template class Allocator> + cmph_io_adapter_t* CmphStringVectorAdapter(StringVector& sv) + { + cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv); + + key_source->read = CmphStringVectorAdapterRead; + key_source->dispose = CmphStringVectorAdapterDispose; + key_source->rewind = CmphStringVectorAdapterRewind; + return key_source; + } + + //************************************************************************// + + cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v); + + int CmphVectorAdapterRead(void 
*data, char **key, cmph_uint32 *keylen); + + void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + + void CmphVectorAdapterRewind(void *data); + + cmph_io_adapter_t* CmphVectorAdapter(std::vector& v); + +} + + +#endif diff --git a/moses/src/CompactPT/ConsistantPhrases.h b/moses/src/CompactPT/ConsistantPhrases.h new file mode 100644 index 000000000..55472d4d7 --- /dev/null +++ b/moses/src/CompactPT/ConsistantPhrases.h @@ -0,0 +1,114 @@ +#ifndef moses_ConsistantPhrases_h +#define moses_ConsistantPhrases_h + +#include + +namespace Moses +{ + +class ConsistantPhrases +{ + public: + struct Phrase + { + int i, j, m, n; + Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { } + }; + + struct PhraseSorter + { + bool operator()(Phrase a, Phrase b) + { + if(a.j < b.j) + return true; + if(a.j == b.j && a.n > b.n) + return true; + if(a.j == b.j && a.n == b.n && a.i < b.i) + return true; + if(a.j == b.j && a.n == b.n && a.i == b.i && a.m > b.m) + return true; + /* + if(a.n > b.n) + return true; + if(a.n == b.n && a.j < b.j) + return true; + if(a.n == b.n && a.j == b.j && a.m > b.m) + return true; + if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i) + return true; + */ + return false; + } + }; + + private: + typedef std::set PhraseQueue; + PhraseQueue m_phraseQueue; + + public: + + template + ConsistantPhrases(int mmax, int nmax, It begin, It end) + { + for(int i = 0; i < mmax; i++) + { + for(int m = 1; m <= mmax-i; m++) + { + for(int j = 0; j < nmax; j++) + { + for(int n = 1; n <= nmax-j; n++) + { + bool consistant = true; + for(It it = begin; it != end; it++) + { + int ip = it->first; + int jp = it->second; + if((i <= ip && ip < i+m) != (j <= jp && jp < j+n)) + { + consistant = false; + break; + } + } + if(consistant) + m_phraseQueue.insert(Phrase(i, m, j, n)); + } + } + } + } + m_phraseQueue.erase(Phrase(0, mmax, 0, nmax)); + } + + size_t Size() + { + return m_phraseQueue.size(); + } + + Phrase Pop() + { + if(m_phraseQueue.size()) + 
{ + Phrase p = *m_phraseQueue.begin(); + m_phraseQueue.erase(m_phraseQueue.begin()); + return p; + } + return Phrase(0,0,0,0); + } + + void RemoveOverlap(Phrase p) + { + PhraseQueue ok; + for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++) + { + Phrase pp = *it; + if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) || + (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n))) + ok.insert(pp); + } + m_phraseQueue = ok; + } + +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/LexicalReorderingTableCompact.cpp b/moses/src/CompactPT/LexicalReorderingTableCompact.cpp new file mode 100644 index 000000000..ead06171e --- /dev/null +++ b/moses/src/CompactPT/LexicalReorderingTableCompact.cpp @@ -0,0 +1,135 @@ +#include "LexicalReorderingTableCompact.h" + +namespace Moses { + +LexicalReorderingTableCompact::LexicalReorderingTableCompact( + const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) + : LexicalReorderingTable(f_factors, e_factors, c_factors), + m_inMemory(StaticData::Instance().UseMinlexrInMemory()), + m_numScoreComponent(6), m_multipleScoreTrees(true), + m_hash(10, 16), m_scoreTrees(1, NULL) +{ + Load(filePath); +} + +LexicalReorderingTableCompact::LexicalReorderingTableCompact( + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) + : LexicalReorderingTable(f_factors, e_factors, c_factors), + m_inMemory(StaticData::Instance().UseMinlexrInMemory()), + m_numScoreComponent(6), m_multipleScoreTrees(true), + m_hash(10, 16), m_scoreTrees(1, NULL) +{ } + +LexicalReorderingTableCompact::~LexicalReorderingTableCompact() { + for(size_t i = 0; i < m_scoreTrees.size(); i++) + delete m_scoreTrees[i]; +} + +std::vector LexicalReorderingTableCompact::GetScore(const Phrase& f, + const Phrase& e, + const Phrase& c) +{ + std::string key; + Scores scores; + + if(0 == 
c.GetSize()) + key = MakeKey(f, e, c); + else + for(size_t i = 0; i <= c.GetSize(); ++i) + { + Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1))); + key = MakeKey(f,e,sub_c); + } + + size_t index = m_hash[key]; + if(m_hash.GetSize() != index) + { + std::string scoresString; + if(m_inMemory) + scoresString = m_scoresMemory[index]; + else + scoresString = m_scoresMapped[index]; + + BitStream<> bitStream(scoresString); + for(size_t i = 0; i < m_numScoreComponent; i++) + scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->NextSymbol(bitStream)); + + return scores; + } + + return Scores(); +} + +std::string LexicalReorderingTableCompact::MakeKey(const Phrase& f, + const Phrase& e, + const Phrase& c) const +{ + return MakeKey(Trim(f.GetStringRep(m_FactorsF)), + Trim(e.GetStringRep(m_FactorsE)), + Trim(c.GetStringRep(m_FactorsC))); +} + +std::string LexicalReorderingTableCompact::MakeKey(const std::string& f, + const std::string& e, + const std::string& c) const +{ + std::string key; + if(!f.empty()) + { + key += f; + } + if(!m_FactorsE.empty()) + { + if(!key.empty()) + { + key += " ||| "; + } + key += e; + } + if(!m_FactorsC.empty()) + { + if(!key.empty()) + { + key += " ||| "; + } + key += c; + } + key += " ||| "; + return key; +} + +void LexicalReorderingTableCompact::Load(std::string filePath) +{ + std::FILE* pFile = std::fopen(filePath.c_str(), "r"); + if(m_inMemory) + m_hash.Load(pFile); + else + m_hash.LoadIndex(pFile); + + std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile); + std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile); + + if(m_multipleScoreTrees) + { + m_scoreTrees.resize(m_numScoreComponent); + for(size_t i = 0; i < m_numScoreComponent; i++) + m_scoreTrees[i] = new CanonicalHuffman(pFile); + } + else + { + m_scoreTrees.resize(1); + m_scoreTrees[0] = new CanonicalHuffman(pFile); + } + + if(m_inMemory) + m_scoresMemory.load(pFile, false); + else + m_scoresMapped.load(pFile, true); +} + +} \ No 
newline at end of file diff --git a/moses/src/CompactPT/LexicalReorderingTableCompact.h b/moses/src/CompactPT/LexicalReorderingTableCompact.h new file mode 100644 index 000000000..0f58a2b7c --- /dev/null +++ b/moses/src/CompactPT/LexicalReorderingTableCompact.h @@ -0,0 +1,56 @@ +#ifndef moses_LexicalReorderingTableCompact_h +#define moses_LexicalReorderingTableCompact_h + +#include "LexicalReorderingTable.h" +#include "StaticData.h" +#include "PhraseDictionary.h" +#include "GenerationDictionary.h" +#include "TargetPhrase.h" +#include "TargetPhraseCollection.h" + +#include "CompactPT/BlockHashIndex.h" +#include "CompactPT/CanonicalHuffman.h" +#include "CompactPT/StringVector.h" + +namespace Moses { + +class LexicalReorderingTableCompact: public LexicalReorderingTable +{ + private: + bool m_inMemory; + + size_t m_numScoreComponent; + bool m_multipleScoreTrees; + + BlockHashIndex m_hash; + + typedef CanonicalHuffman ScoreTree; + std::vector m_scoreTrees; + + StringVector m_scoresMapped; + StringVector m_scoresMemory; + + std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; + std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const; + + public: + LexicalReorderingTableCompact( + const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + LexicalReorderingTableCompact( + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + virtual ~LexicalReorderingTableCompact(); + + virtual std::vector GetScore(const Phrase& f, const Phrase& e, const Phrase& c); + void Load(std::string filePath); +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/LexicalReorderingTableCreator.cpp b/moses/src/CompactPT/LexicalReorderingTableCreator.cpp new file mode 100644 index 000000000..5e1e7c3ec --- /dev/null +++ b/moses/src/CompactPT/LexicalReorderingTableCreator.cpp @@ -0,0 +1,388 @@ +#include 
"LexicalReorderingTableCreator.h" + +namespace Moses { + +LexicalReorderingTableCreator::LexicalReorderingTableCreator( + std::string inPath, std::string outPath, size_t numScoreComponent, + size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees, + size_t quantize +#ifdef WITH_THREADS + , size_t threads +#endif + ) + : m_inPath(inPath), m_outPath(outPath), m_numScoreComponent(numScoreComponent), + m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize), + m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits), + m_lastFlushedLine(-1) +#ifdef WITH_THREADS + , m_threads(threads) +#endif +{ + + m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); + for(std::vector::iterator it = m_scoreCounters.begin(); + it != m_scoreCounters.end(); it++) + *it = new ScoreCounter(); + m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); + + if(m_outPath.rfind(".mphlexr") != m_outPath.size() - 8) + m_outPath += ".mphlexr"; + + PrintInfo(); + + m_outFile = std::fopen(m_outPath.c_str(), "w"); + + std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; + m_hash.BeginSave(m_outFile); + EncodeScores(); + + std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; + CalcHuffmanCodes(); + + std::cerr << "Pass 2/2: Compressing scores" << std::endl; + CompressScores(); + + std::cerr << "Saving to " << m_outPath << std::endl; + Save(); + std::cerr << "Done" << std::endl; + std::fclose(m_outFile); +} + +void LexicalReorderingTableCreator::PrintInfo() +{ + std::cerr << "Used options:" << std::endl; + std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl; + std::cerr << "\tOuput reordering table will be written to: " << m_outPath << std::endl; + std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl; + std::cerr << "\tPhrase fingerprint size: " << 
m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl; + std::cerr << "\tNumber of score components in reordering table: " << m_numScoreComponent << std::endl; + std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; + std::cerr << "\tUsing score quantization: "; + if(m_quantize) + std::cerr << m_quantize << " best" << std::endl; + else + std::cerr << "no" << std::endl; + +#ifdef WITH_THREADS + std::cerr << "\tRunning with " << m_threads << " threads" << std::endl; +#endif + std::cerr << std::endl; +} + +void LexicalReorderingTableCreator::EncodeScores() +{ + InputFileStream inFile(m_inPath); + +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) + { + EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this); + threads.create_thread(*et); + } + threads.join_all(); +#else + EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this); + (*et)(); + delete et; +#endif + FlushEncodedQueue(true); +} + +void LexicalReorderingTableCreator::CalcHuffmanCodes() +{ + std::vector::iterator treeIt = m_scoreTrees.begin(); + for(std::vector::iterator it = m_scoreCounters.begin(); + it != m_scoreCounters.end(); it++) + { + if(m_quantize) + (*it)->Quantize(m_quantize); + + std::cerr << "\tCreating Huffman codes for " << (*it)->Size() + << " scores" << std::endl; + + *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); + treeIt++; + } + std::cerr << std::endl; +} + +void LexicalReorderingTableCreator::CompressScores() +{ +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) { + CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this); + threads.create_thread(*ct); + } + threads.join_all(); +#else + CompressionTaskReordering* ct = new CompressionTaskReordering(m_scores, *this); + (*ct)(); + delete ct; +#endif + FlushCompressedQueue(true); +} + +void 
LexicalReorderingTableCreator::Save() +{ + std::fwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile); + std::fwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile); + for(size_t i = 0; i < m_scoreTrees.size(); i++) + m_scoreTrees[i]->Save(m_outFile); + + m_compressedScores.save(m_outFile); +} + +std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) +{ + return source + m_separator + target + m_separator; +} + +std::string LexicalReorderingTableCreator::EncodeLine(std::vector& tokens) +{ + std::string scoresString = tokens[2]; + std::stringstream scoresStream; + + std::vector scores; + Tokenize(scores, scoresString); + + size_t c = 0; + float score; + while(c < m_numScoreComponent) + { + score = scores[c]; + score = FloorScore(TransformScore(score)); + scoresStream.write((char*)&score, sizeof(score)); + + m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score); + c++; + } + + return scoresStream.str(); +} + +void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi) +{ + m_queue.push(pi); +} + +void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) { + if(force || m_queue.size() > 10000) + { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) + { + PackedItem pi = m_queue.top(); + m_queue.pop(); + m_lastFlushedLine++; + + m_lastRange.push_back(pi.GetSrc()); + m_encodedScores.push_back(pi.GetTrg()); + + if((pi.GetLine()+1) % 100000 == 0) + std::cerr << "."; + if((pi.GetLine()+1) % 5000000 == 0) + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + + if(m_lastRange.size() == (1ul << m_orderBits)) + { + m_hash.AddRange(m_lastRange); + m_hash.SaveLastRange(); + m_hash.DropLastRange(); + m_lastRange.clear(); + } + } + } + + if(force) + { + m_lastFlushedLine = -1; + + m_hash.AddRange(m_lastRange); + m_lastRange.clear(); + +#ifdef WITH_THREADS + m_hash.WaitAll(); +#endif + + m_hash.SaveLastRange(); + m_hash.DropLastRange(); + 
m_hash.FinalizeSave(); + + std::cerr << std::endl << std::endl; + } +} + +std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) { + std::stringstream encodedScoresStream(encodedScores); + encodedScoresStream.unsetf(std::ios::skipws); + + std::string compressedScores; + BitStream<> compressedScoresStream(compressedScores); + + size_t currScore = 0; + float score; + encodedScoresStream.read((char*) &score, sizeof(score)); + + while(encodedScoresStream) { + size_t index = currScore % m_scoreTrees.size(); + + if(m_quantize) + score = m_scoreCounters[index]->LowerBound(score); + + compressedScoresStream.PutCode(m_scoreTrees[index]->Encode(score)); + encodedScoresStream.read((char*) &score, sizeof(score)); + currScore++; + } + + return compressedScores; +} + +void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) { + m_queue.push(pi); +} + +void LexicalReorderingTableCreator::FlushCompressedQueue(bool force) +{ + if(force || m_queue.size() > 10000) + { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) + { + PackedItem pi = m_queue.top(); + m_queue.pop(); + m_lastFlushedLine++; + + m_compressedScores.push_back(pi.GetTrg()); + + if((pi.GetLine()+1) % 100000 == 0) + std::cerr << "."; + if((pi.GetLine()+1) % 5000000 == 0) + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + } + } + + if(force) + { + m_lastFlushedLine = -1; + std::cerr << std::endl << std::endl; + } +} + +//****************************************************************************// + +size_t EncodingTaskReordering::m_lineNum = 0; +#ifdef WITH_THREADS +boost::mutex EncodingTaskReordering::m_mutex; +boost::mutex EncodingTaskReordering::m_fileMutex; +#endif + +EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator) + : m_inFile(inFile), m_creator(creator) {} + +void EncodingTaskReordering::operator()() +{ + size_t lineNum = 0; + + std::vector lines; + size_t 
max_lines = 1000; + lines.reserve(max_lines); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } + + std::vector result; + result.reserve(max_lines); + + while(lines.size()) + { + for(size_t i = 0; i < lines.size(); i++) + { + std::vector tokens; + Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + + std::string encodedLine = m_creator.EncodeLine(tokens); + + PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(tokens[0], tokens[1]), + encodedLine, i); + result.push_back(packedItem); + } + lines.clear(); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + for(size_t i = 0; i < result.size(); i++) + m_creator.AddEncodedLine(result[i]); + m_creator.FlushEncodedQueue(); + } + + result.clear(); + lines.reserve(max_lines); + result.reserve(max_lines); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } +} + +//****************************************************************************// + +size_t CompressionTaskReordering::m_scoresNum = 0; +#ifdef WITH_THREADS +boost::mutex CompressionTaskReordering::m_mutex; +#endif + +CompressionTaskReordering::CompressionTaskReordering(StringVector& encodedScores, + LexicalReorderingTableCreator& creator) + : m_encodedScores(encodedScores), m_creator(creator) +{ } + +void CompressionTaskReordering::operator()() +{ + size_t scoresNum; + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + scoresNum = m_scoresNum; + m_scoresNum++; + } + + while(scoresNum < m_encodedScores.size()) + { + std::string scores = m_encodedScores[scoresNum]; + std::string 
compressedScores + = m_creator.CompressEncodedScores(scores); + + std::string dummy; + PackedItem packedItem(scoresNum, dummy, compressedScores, 0); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_creator.AddCompressedScores(packedItem); + m_creator.FlushCompressedQueue(); + + scoresNum = m_scoresNum; + m_scoresNum++; + } +} + +} diff --git a/moses/src/CompactPT/LexicalReorderingTableCreator.h b/moses/src/CompactPT/LexicalReorderingTableCreator.h new file mode 100644 index 000000000..a6968d063 --- /dev/null +++ b/moses/src/CompactPT/LexicalReorderingTableCreator.h @@ -0,0 +1,117 @@ +#ifndef moses_LexicalReorderingTableCreator_h +#define moses_LexicalReorderingTableCreator_h + +#include "PhraseTableCreator.h" + +namespace Moses { + +class LexicalReorderingTableCreator { + private: + std::string m_inPath; + std::string m_outPath; + + std::FILE* m_outFile; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + size_t m_numScoreComponent; + + bool m_multipleScoreTrees; + bool m_quantize; + + std::string m_separator; + + BlockHashIndex m_hash; + +#ifdef WITH_THREADS + size_t m_threads; +#endif + + typedef Counter ScoreCounter; + typedef CanonicalHuffman ScoreTree; + + std::vector m_scoreCounters; + std::vector m_scoreTrees; + + StringVector m_encodedScores; + StringVector m_compressedScores; + + std::priority_queue m_queue; + long m_lastFlushedLine; + long m_lastFlushedSourceNum; + std::string m_lastFlushedSourcePhrase; + std::vector m_lastRange; + + void PrintInfo(); + + void EncodeScores(); + void CalcHuffmanCodes(); + void CompressScores(); + void Save(); + + std::string MakeSourceTargetKey(std::string&, std::string&); + + std::string EncodeLine(std::vector& tokens); + void AddEncodedLine(PackedItem& pi); + void FlushEncodedQueue(bool force = false); + + std::string CompressEncodedScores(std::string &encodedScores); + void AddCompressedScores(PackedItem& pi); + void FlushCompressedQueue(bool force = false); + + public: + 
LexicalReorderingTableCreator(std::string inPath, + std::string outPath, + size_t numScoreComponent = 6, + size_t orderBits = 10, + size_t fingerPrintBits = 16, + bool multipleScoreTrees = true, + size_t quantize = 0 +#ifdef WITH_THREADS + , size_t threads = 2 +#endif + ); + + friend class EncodingTaskReordering; + friend class CompressionTaskReordering; +}; + +class EncodingTaskReordering +{ + private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; +#endif + static size_t m_lineNum; + static size_t m_sourcePhraseNum; + static std::string m_lastSourcePhrase; + + InputFileStream& m_inFile; + LexicalReorderingTableCreator& m_creator; + + public: + EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator); + void operator()(); +}; + +class CompressionTaskReordering +{ + private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; +#endif + static size_t m_scoresNum; + StringVector &m_encodedScores; + LexicalReorderingTableCreator &m_creator; + + public: + CompressionTaskReordering(StringVector& + m_encodedScores, LexicalReorderingTableCreator& creator); + void operator()(); +}; + +} + +#endif diff --git a/moses/src/CompactPT/ListCoders.h b/moses/src/CompactPT/ListCoders.h new file mode 100644 index 000000000..6ee6163b6 --- /dev/null +++ b/moses/src/CompactPT/ListCoders.h @@ -0,0 +1,291 @@ +#ifndef moses_ListCoders_h +#define moses_ListCoders_h + +#include +#include + +namespace Moses +{ + +template +class VarIntType +{ + private: + template + static void EncodeSymbol(IntType input, OutIt output) + { + if(input == 0) + { + *output = 0; + output++; + return; + } + + T msb = 1 << (sizeof(T)*8-1); + IntType mask = ~msb; + IntType shift = (sizeof(T)*8-1); + + while(input) + { + T res = input & mask; + input >>= shift; + if(input) + res |= msb; + *output = res; + output++; + } + }; + + template + static void DecodeSymbol(InIt &it, InIt end, IntType &output) + { + T msb = 1 << (sizeof(T)*8-1); + IntType 
shift = (sizeof(T)*8-1); + + output = 0; + size_t i = 0; + while(it != end && *it & msb) { + IntType temp = *it & ~msb; + temp <<= shift*i; + output |= temp; + it++; i++; + } + assert(it != end); + + IntType temp = *it; + temp <<= shift*i; + output |= temp; + it++; + } + + + + public: + + template + static void Encode(InIt it, InIt end, OutIt outIt) + { + while(it != end) + { + EncodeSymbol(*it, outIt); + it++; + } + } + + + template + static void Decode(InIt &it, InIt end, OutIt outIt) + { + while(it != end) + { + size_t output; + DecodeSymbol(it, end, output); + *outIt = output; + outIt++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) + { + size_t sum = 0; + size_t curr = 0; + + while(it != end && curr < num) + { + size_t output; + DecodeSymbol(it, end, output); + sum += output; curr++; + } + + return sum; + } + +}; + +typedef VarIntType VarByte; + +typedef VarByte VarInt8; +typedef VarIntType VarInt16; +typedef VarIntType VarInt32; + +class Simple9 +{ + private: + typedef unsigned int uint; + + template + inline static void EncodeSymbol(uint &output, InIt it, InIt end) + { + uint length = end - it; + + uint type; + uint bitlength; + + switch(length) + { + case 1: type = 1; bitlength = 28; break; + case 2: type = 2; bitlength = 14; break; + case 3: type = 3; bitlength = 9; break; + case 4: type = 4; bitlength = 7; break; + case 5: type = 5; bitlength = 5; break; + case 7: type = 6; bitlength = 4; break; + case 9: type = 7; bitlength = 3; break; + case 14: type = 8; bitlength = 2; break; + case 28: type = 9; bitlength = 1; break; + } + + output = 0; + output |= (type << 28); + + uint i = 0; + while(it != end) + { + uint l = bitlength * (length-i-1); + output |= *it << l; + it++; + i++; + } + } + + template + static inline void DecodeSymbol(uint input, OutIt outIt) + { + uint type = (input >> 28); + + uint bitlen; + uint shift; + uint mask; + + switch(type) + { + case 1: bitlen = 28; shift = 0; mask = 268435455; break; + case 2: 
bitlen = 14; shift = 14; mask = 16383; break; + case 3: bitlen = 9; shift = 18; mask = 511; break; + case 4: bitlen = 7; shift = 21; mask = 127; break; + case 5: bitlen = 5; shift = 20; mask = 31; break; + case 6: bitlen = 4; shift = 24; mask = 15; break; + case 7: bitlen = 3; shift = 24; mask = 7; break; + case 8: bitlen = 2; shift = 26; mask = 3; break; + case 9: bitlen = 1; shift = 27; mask = 1; break; + } + + while(shift > 0) + { + *outIt = (input >> shift) & mask; + shift -= bitlen; + outIt++; + } + *outIt = input & mask; + outIt++; + } + + static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) + { + uint type = (input >> 28); + + uint bitlen; + uint shift; + uint mask; + + switch(type) + { + case 1: bitlen = 28; shift = 0; mask = 268435455; break; + case 2: bitlen = 14; shift = 14; mask = 16383; break; + case 3: bitlen = 9; shift = 18; mask = 511; break; + case 4: bitlen = 7; shift = 21; mask = 127; break; + case 5: bitlen = 5; shift = 20; mask = 31; break; + case 6: bitlen = 4; shift = 24; mask = 15; break; + case 7: bitlen = 3; shift = 24; mask = 7; break; + case 8: bitlen = 2; shift = 26; mask = 3; break; + case 9: bitlen = 1; shift = 27; mask = 1; break; + } + + size_t sum = 0; + while(shift > 0) + { + sum += (input >> shift) & mask; + shift -= bitlen; + if(++curr == num) + return sum; + } + sum += input & mask; + curr++; + return sum; + } + + public: + template + static void Encode(InIt it, InIt end, OutIt outIt) + { + uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; + + uint buffer[28]; + for(InIt i = it; i < end; i++) + { + uint lastbit = 1; + uint lastpos = 0; + uint lastyes = 0; + uint j = 0; + + double log2 = log(2); + while(j < 9 && lastpos < 28 && (i+lastpos) < end) + { + if(lastpos >= parts[j]) + j++; + + buffer[lastpos] = *(i + lastpos); + + uint reqbit = ceil(log(buffer[lastpos]+1)/log2); + assert(reqbit <= 28); + + uint bit = 28/floor(28/reqbit); + if(lastbit < bit) + lastbit = bit; + + if(parts[j] > 28/lastbit) + break; 
+ else if(lastpos == parts[j]-1) + lastyes = lastpos; + + lastpos++; + } + i += lastyes; + + uint length = lastyes + 1; + uint output; + EncodeSymbol(output, buffer, buffer + length); + + *outIt = output; + outIt++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) + { + while(it != end) + { + DecodeSymbol(*it, outIt); + it++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) + { + size_t sum = 0; + size_t curr = 0; + while(it != end && curr < num) + { + sum += DecodeAndSumSymbol(*it, num, curr); + it++; + } + assert(curr == num); + return sum; + } +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/MmapAllocator.h b/moses/src/CompactPT/MmapAllocator.h new file mode 100644 index 000000000..c3933cfdf --- /dev/null +++ b/moses/src/CompactPT/MmapAllocator.h @@ -0,0 +1,175 @@ +#ifndef moses_MmapAllocator_h +#define moses_MmapAllocator_h + +#include +#include +#include +#include +#include + +namespace Moses +{ + template + class MmapAllocator + { + protected: + std::FILE* m_file_ptr; + size_t m_file_desc; + + size_t m_page_size; + size_t m_map_size; + + char* m_data_ptr; + size_t m_data_offset; + bool m_fixed; + + public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + MmapAllocator() throw() + : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false) + { } + + MmapAllocator(std::FILE* f_ptr) throw() + : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false) + { } + + MmapAllocator(std::FILE* f_ptr, size_t data_offset = 0) throw() + : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), + 
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(data_offset), m_fixed(true) + { } + + MmapAllocator(std::string fileName) throw() + : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)), + m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0), + m_data_offset(0), m_fixed(false) + { } + + MmapAllocator(const MmapAllocator& c) throw() + : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc), + m_page_size(c.m_page_size), m_map_size(c.m_map_size), + m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset), + m_fixed(c.m_fixed) + { } + + ~MmapAllocator() throw() + { + if(m_data_ptr) + { + munmap(m_data_ptr, m_map_size); + if(!m_fixed && std::ftell(m_file_ptr) != -1) + std::fclose(m_file_ptr); + } + } + + template + struct rebind { + typedef MmapAllocator other; + }; + + pointer address (reference value) const + { + return &value; + } + + const_pointer address (const_reference value) const + { + return &value; + } + + size_type max_size () const throw() + { + return std::numeric_limits::max() / sizeof(value_type); + } + + pointer allocate (size_type num, const void* = 0) + { + m_map_size = num * sizeof(T); + + if(!m_fixed) + { + ftruncate(m_file_desc, m_map_size); + m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED, + m_file_desc, 0); + return (pointer)m_data_ptr; + } + else + { + size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + size_t relative_offset = m_data_offset - map_offset; + + size_t map_size = m_map_size + relative_offset; + + m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED, + m_file_desc, map_offset); + + return (pointer)(m_data_ptr + relative_offset); + } + } + + void deallocate (pointer p, size_type num) + { + if(!m_fixed) + munmap(p, num * sizeof(T)); + else { + size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + size_t relative_offset = m_data_offset - map_offset; + munmap((pointer)((char*)p - relative_offset), num 
* sizeof(T)); + } + + } + + void construct (pointer p, const T& value) + { + if(!m_fixed) + new(p) value_type(value); + } + void destroy (pointer p) + { + if(!m_fixed) + p->~T(); + } + + template + friend bool operator== (const MmapAllocator&, const MmapAllocator&) throw(); + + template + friend bool operator!= (const MmapAllocator&, const MmapAllocator&) throw(); + }; + + template + bool operator== (const MmapAllocator& a1, + const MmapAllocator& a2) throw() + { + bool equal = true; + equal &= a1.m_file_ptr == a2.m_file_ptr; + equal &= a1.m_file_desc == a2.m_file_desc; + equal &= a1.m_page_size == a2.m_page_size; + equal &= a1.m_map_size == a2.m_map_size; + equal &= a1.m_data_ptr == a2.m_data_ptr; + equal &= a1.m_data_offset == a2.m_data_offset; + equal &= a1.m_fixed == a2.m_fixed; + return equal; + } + + template + bool operator!=(const MmapAllocator& a1, + const MmapAllocator& a2) throw() + { + return !(a1 == a2); + } + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/MonotonicVector.h b/moses/src/CompactPT/MonotonicVector.h new file mode 100644 index 000000000..f613306bd --- /dev/null +++ b/moses/src/CompactPT/MonotonicVector.h @@ -0,0 +1,227 @@ +#ifndef moses_MonotonicVector_h +#define moses_MonotonicVector_h + +// MonotonicVector - Represents a monotonic increasing function that maps +// positive integers of any size onto a given number type. Each value has to be +// equal or larger than the previous one. Depending on the stepSize it can save +// up to 90% of memory compared to a std::vector. Time complexity is roughly +// constant, in the worst case, however, stepSize times slower than a normal +// std::vector. 
+ +#include +#include +#include +#include +#include + +#include "ListCoders.h" +#include "MmapAllocator.h" + +namespace Moses +{ + +template class Allocator = std::allocator> +class MonotonicVector +{ + private: + typedef std::vector > Anchors; + typedef std::vector > Diffs; + + Anchors m_anchors; + Diffs m_diffs; + std::vector m_tempDiffs; + + size_t m_size; + PosT m_last; + bool m_final; + + public: + typedef PosT value_type; + + MonotonicVector() : m_size(0), m_last(0), m_final(false) {} + + size_t size() const + { + return m_size + m_tempDiffs.size(); + } + + PosT at(size_t i) const + { + PosT s = stepSize; + PosT j = m_anchors[i / s]; + PosT r = i % s; + + typename Diffs::const_iterator it = m_diffs.begin() + j; + + PosT k = 0; + k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1); + if(i < m_size) + k += Simple9::DecodeAndSum(it, m_diffs.end(), r); + else if(i < m_size + m_tempDiffs.size()) + for(size_t l = 0; l < r; l++) + k += m_tempDiffs[l]; + + return k; + } + + PosT operator[](PosT i) const + { + return at(i); + } + + PosT back() const + { + return at(size()-1); + } + + void push_back(PosT i) + { + assert(m_final != true); + + if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) + { + m_anchors.push_back(0); + VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); + m_last = i; + m_size++; + + return; + } + + if(m_tempDiffs.size() == stepSize-1) + { + Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), + std::back_inserter(m_diffs)); + m_anchors.push_back(m_diffs.size()); + VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs)); + + m_size += m_tempDiffs.size() + 1; + m_tempDiffs.clear(); + } + else + { + PosT last = m_last; + PosT diff = i - last; + m_tempDiffs.push_back(diff); + } + m_last = i; + } + + void commit() + { + assert(m_final != true); + Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(), + std::back_inserter(m_diffs)); + m_size += m_tempDiffs.size(); + m_tempDiffs.clear(); + m_final = true; + } + + size_t usage() + { + return 
m_diffs.size() * sizeof(unsigned int) + + m_anchors.size() * sizeof(NumT); + } + + size_t load(std::FILE* in, bool map = false) + { + size_t byteSize = 0; + + byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool); + byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t); + byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT); + + byteSize += loadVector(m_diffs, in, map); + byteSize += loadVector(m_anchors, in, map); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) + { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + v.resize(valSize, 0); + byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + + return byteSize; + } + + template + size_t loadVector(std::vector >& v, + std::FILE* in, bool map = false) + { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if(map == false) + { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. + + v.resize(valSize, 0); + byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + } + else + { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
+ + size_t valPos = std::ftell(in); + + Allocator alloc(in, valPos); + std::vector > vTemp(alloc); + vTemp.resize(valSize); + v.swap(vTemp); + + std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR); + byteSize += valSize * sizeof(ValueT); + } + + return byteSize; + } + + size_t save(std::FILE* out) + { + if(!m_final) + commit(); + + bool byteSize = 0; + byteSize += fwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool); + byteSize += fwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += fwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT); + + size_t size = m_diffs.size(); + byteSize += fwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += fwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int); + + size = m_anchors.size(); + byteSize += fwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += fwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT); + + return byteSize; + } + + void swap(MonotonicVector &mv) + { + if(!m_final) + commit(); + + m_diffs.swap(mv.m_diffs); + m_anchors.swap(mv.m_anchors); + } +}; + +} +#endif diff --git a/moses/src/CompactPT/MurmurHash3.cpp b/moses/src/CompactPT/MurmurHash3.cpp new file mode 100644 index 000000000..0bf738662 --- /dev/null +++ b/moses/src/CompactPT/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 
0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; 
+ uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + 
} + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/moses/src/CompactPT/MurmurHash3.h b/moses/src/CompactPT/MurmurHash3.h new file mode 100644 index 000000000..58e98204d --- /dev/null +++ b/moses/src/CompactPT/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+typedef unsigned char uint8_t;
+typedef unsigned long uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+// Header name restored (it was stripped from this copy): <stdint.h> is the
+// standard header that supplies the uint8_t/uint32_t/uint64_t names used here.
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+// Hashes len bytes starting at key; writes one 32-bit word to out.
+void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
+
+// Hashes len bytes starting at key; writes four 32-bit words (16 bytes) to out.
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+// Hashes len bytes starting at key; writes two 64-bit words (16 bytes) to out.
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
diff --git a/moses/src/CompactPT/PackedArray.h b/moses/src/CompactPT/PackedArray.h
new file mode 100644
index 000000000..26d4dd1b0
--- /dev/null
+++ b/moses/src/CompactPT/PackedArray.h
@@ -0,0 +1,177 @@
+#ifndef moses_PackedArray_h
+#define moses_PackedArray_h
+
+// NOTE(review): the four header names below were stripped from this copy of
+// the patch; <vector>, <cmath>, <cstring> and <cstdio> are presumed - confirm
+// against upstream. <utility> is added because the file uses std::pair.
+#include <vector>
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include <utility>
+
+namespace Moses
+{
+
+// Returns a value of type U with its lowest n bits set. Unlike a bare
+// (1 << n) - 1 this never shifts by the full width of U, so n may be any
+// value from 0 up to (and beyond) the number of bits in U.
+template <typename U>
+inline U PackedLowMask(size_t n)
+{
+  return n >= sizeof(U) * 8 ? U(~U(0)) : U((U(1) << n) - 1);
+}
+
+// Array of unsigned integers of type T, each stored in a caller-chosen number
+// of bits and packed back to back into cells of type D. The bit width is not
+// remembered by the array: every Get/Set call has to pass the same width that
+// was given to the constructor.
+//
+// NOTE(review): the template parameter list was stripped from this copy; the
+// names T and D come from the class body, the default arguments are presumed.
+template <typename T = size_t, typename D = unsigned char>
+class PackedArray
+{
+  protected:
+    static size_t m_dataBits;   // number of bits in one storage cell (D)
+
+    size_t m_size;              // number of packed elements
+    size_t m_storageSize;       // number of cells in m_storage
+    D* m_storage;               // owned cell buffer
+
+  public:
+    // Creates an empty array.
+    PackedArray()
+    : m_size(0), m_storageSize(0), m_storage(new D[0])
+    {}
+
+    // Creates an array of size elements of bits bits each, all zero.
+    // The cell count is an integer ceiling: going through float loses
+    // precision above 2^24 bits and can allocate too few cells.
+    PackedArray(size_t size, size_t bits)
+    : m_size(size),
+      m_storageSize((bits * size + m_dataBits - 1) / m_dataBits),
+      m_storage(new D[m_storageSize]())
+    {}
+
+    PackedArray(const PackedArray<T, D> &c)
+    : m_size(c.m_size), m_storageSize(c.m_storageSize),
+      m_storage(new D[c.m_storageSize])
+    {
+      std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
+    }
+
+    // Deep-copying assignment. Without it the implicitly generated operator
+    // would share m_storage between two arrays and free it twice.
+    PackedArray<T, D>& operator=(const PackedArray<T, D> &c)
+    {
+      if(this != &c) {
+        D* storage = new D[c.m_storageSize];
+        std::memcpy(storage, c.m_storage, c.m_storageSize * sizeof(D));
+        delete [] m_storage;
+        m_storage = storage;
+        m_size = c.m_size;
+        m_storageSize = c.m_storageSize;
+      }
+      return *this;
+    }
+
+    // Virtual because the class has virtual members and is derived from.
+    virtual ~PackedArray()
+    {
+      delete [] m_storage;
+      m_size = 0;
+      m_storageSize = 0;
+      m_storage = 0;
+    }
+
+    // Returns element i, where every element occupies bits bits.
+    T Get(size_t i, size_t bits) const
+    {
+      T out = 0;
+
+      const size_t bitstart = i * bits;
+      size_t done = 0;    // bits of the element gathered so far
+
+      while(done < bits) {
+        const size_t pos = (bitstart + done) / m_dataBits;
+        const size_t off = (bitstart + done) % m_dataBits;
+
+        // Drop the bits below the element, then move the rest into place.
+        out |= T(T(m_storage[pos] >> off) << done);
+
+        done += m_dataBits - off;
+      }
+
+      // The last cell may carry bits of the following element.
+      return out & PackedLowMask<T>(bits);
+    }
+
+    // Stores the lowest bits bits of v as element i. Bits of v above that
+    // width are ignored, so they cannot spill into neighbouring elements.
+    void Set(size_t i, T v, size_t bits)
+    {
+      const size_t bitstart = i * bits;
+      size_t done = 0;    // bits of v written so far
+
+      while(done < bits) {
+        const size_t pos = (bitstart + done) / m_dataBits;
+        const size_t off = (bitstart + done) % m_dataBits;
+
+        // Number of bits of v that land in this cell.
+        const size_t room = m_dataBits - off;
+        const size_t take = bits - done < room ? bits - done : room;
+
+        // Only the field [off, off + take) of the cell may change.
+        const D field = D(PackedLowMask<D>(take) << off);
+        m_storage[pos] = D((m_storage[pos] & ~field) | ((D(v) << off) & field));
+
+        done += take;
+        v = take < sizeof(T) * 8 ? T(v >> take) : T(0);
+      }
+    }
+
+    virtual D*& GetStorage()
+    {
+      return m_storage;
+    }
+
+    virtual size_t GetStorageSize() const
+    {
+      return m_storageSize;
+    }
+
+    virtual size_t Size() const
+    {
+      return m_size;
+    }
+
+    // Replaces the contents with an array written by Save(). Returns the
+    // number of bytes consumed, or 0 if the stream ended early; in that case
+    // the array keeps its previous contents.
+    virtual size_t Load(std::FILE* in)
+    {
+      size_t a1 = std::ftell(in);
+
+      size_t size = 0;
+      size_t storageSize = 0;
+      if(std::fread(&size, sizeof(size), 1, in) != 1
+         || std::fread(&storageSize, sizeof(storageSize), 1, in) != 1)
+        return 0;
+
+      D* storage = new D[storageSize];
+      if(std::fread(storage, sizeof(D), storageSize, in) != storageSize) {
+        delete [] storage;
+        return 0;
+      }
+
+      delete [] m_storage;
+      m_storage = storage;
+      m_size = size;
+      m_storageSize = storageSize;
+
+      size_t a2 = std::ftell(in);
+      return a2 - a1;
+    }
+
+    // Writes element count, cell count and the raw cells. Returns the number
+    // of bytes written, or 0 if any write was short.
+    virtual size_t Save(std::FILE* out)
+    {
+      size_t a1 = std::ftell(out);
+
+      if(std::fwrite(&m_size, sizeof(m_size), 1, out) != 1
+         || std::fwrite(&m_storageSize, sizeof(m_storageSize), 1, out) != 1
+         || std::fwrite(m_storage, sizeof(D), m_storageSize, out) != m_storageSize)
+        return 0;
+
+      size_t a2 = std::ftell(out);
+      return a2 - a1;
+    }
+
+};
+
+template <typename T, typename D>
+size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
+
+/**************************************************************************/
+
+// PackedArray whose elements are pairs: the first value lives in the low
+// bits1 bits of an element, the second in the bits2 bits above it.
+template <typename T = size_t, typename D = unsigned char>
+class PairedPackedArray : public PackedArray<T, D>
+{
+  public:
+    PairedPackedArray() : PackedArray<T, D>() {}
+
+    PairedPackedArray(size_t size, size_t bits1, size_t bits2)
+    : PackedArray<T, D>(size, bits1 + bits2) { }
+
+    // Stores (a, b) as element i; a is cut to bits1 bits, b to bits2 bits.
+    void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
+    {
+      T low = a & PackedLowMask<T>(bits1);
+      T high = bits1 < sizeof(T) * 8 ? T(b << bits1) : T(0);
+      PackedArray<T, D>::Set(i, low | high, bits1 + bits2);
+    }
+
+    // Pair overload. It used to call the base Set() without a bit width (so
+    // it could not be instantiated) and packed the halves in the opposite
+    // order to Get(); it now round-trips with Get().
+    void Set(size_t i, std::pair<T, T> p, size_t bits1, size_t bits2)
+    {
+      Set(i, p.first, p.second, bits1, bits2);
+    }
+
+    // Returns element i as (low bits1 bits, remaining bits2 bits).
+    std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) const
+    {
+      T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+      T a = v & PackedLowMask<T>(bits1);
+      T b = bits1 < sizeof(T) * 8 ? T(v >> bits1) : T(0);
+      return std::pair<T, T>(a, b);
+    }
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/moses/src/CompactPT/PhraseDecoder.cpp b/moses/src/CompactPT/PhraseDecoder.cpp
new file mode 100644
index 000000000..ca17881be
--- /dev/null
+++ b/moses/src/CompactPT/PhraseDecoder.cpp
@@ -0,0 +1,469 @@
+// NOTE(review): the header name was stripped from this copy; <deque> is
+// presumed - confirm against upstream.
+#include <deque>
+
+#include "PhraseDecoder.h"
+
+namespace Moses
+{
+
+// Stores the settings shared with the owning phrase dictionary. Nothing is
+// read from disk here; the coding tables and Huffman trees stay empty until
+// Load() is called.
+//
+// NOTE(review): the element types of the vector parameters were stripped from
+// this copy; FactorType and float are presumed - confirm against the header.
+PhraseDecoder::PhraseDecoder(
+  PhraseDictionaryCompact &phraseDictionary,
+  const std::vector<FactorType>* &input,
+  const std::vector<FactorType>* &output,
+  const PhraseDictionaryFeature* feature,
+  size_t numScoreComponent,
+  const std::vector<float>* weight,
+  float weightWP,
+  const LMList* languageModels
+)
+  : m_coding(None), m_numScoreComponent(numScoreComponent),
+    m_containsAlignmentInfo(true), m_maxRank(0),
+    // was left uninitialised, so GetMaxSourcePhraseLength() returned an
+    // indeterminate value until Load() had run
+    m_maxPhraseLength(0),
+    m_symbolTree(0), m_multipleScoreTrees(false),
+    m_scoreTrees(1), m_alignTree(0),
+    m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
+    m_feature(feature), m_weight(weight),
+    m_weightWP(weightWP), m_languageModels(languageModels),
+    m_separator(" ||| ")
+{ }
+
+// Frees the Huffman trees created by Load(); deleting a null pointer is a
+// no-op, so no tree needs to be tested first.
+PhraseDecoder::~PhraseDecoder()
+{
+  delete m_symbolTree;
+
+  for(size_t i = 0; i < m_scoreTrees.size(); i++)
+    delete m_scoreTrees[i];
+
+  delete m_alignTree;
+}
+
+// Returns the id of a source symbol, remembering the answer in
+// m_sourceSymbolsMap so that m_sourceSymbols is searched once per symbol.
+inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
+{
+  boost::unordered_map<std::string, unsigned>::iterator it
+    = m_sourceSymbolsMap.find(symbol);
+  if(it != m_sourceSymbolsMap.end())
+    return it->second;
+
+  size_t idx = m_sourceSymbols.find(symbol);
+  m_sourceSymbolsMap[symbol] = idx;
+  return idx;
+}
+
+inline
std::string PhraseDecoder::GetTargetSymbol(unsigned idx) const
+{
+  // Ids outside the target vocabulary yield a marker string instead of an
+  // out-of-range access.
+  if(idx < m_targetSymbols.size())
+    return m_targetSymbols[idx];
+  return std::string("##ERROR##");
+}
+
+// REnc symbol type, 1 to 4, taken from the two highest bits.
+inline size_t PhraseDecoder::GetREncType(unsigned encodedSymbol)
+{
+  return (encodedSymbol >> 30) + 1;
+}
+
+// PREnc symbol type, 1 or 2, taken from the highest bit.
+inline size_t PhraseDecoder::GetPREncType(unsigned encodedSymbol)
+{
+  return (encodedSymbol >> 31) + 1;
+}
+
+// Returns the target symbol id stored rank entries after the first lexical
+// table entry of source symbol srcIdx. Neither table used to be range
+// checked; an index outside either table now yields an id past the end of
+// the target vocabulary, which GetTargetSymbol() maps to "##ERROR##".
+inline unsigned PhraseDecoder::GetTranslation(unsigned srcIdx, size_t rank)
+{
+  const unsigned invalid = m_targetSymbols.size();
+
+  if(srcIdx >= m_lexicalTableIndex.size())
+    return invalid;
+
+  size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
+  if(srcTrgIdx >= m_lexicalTable.size()
+     || rank >= m_lexicalTable.size() - srcTrgIdx)
+    return invalid;
+
+  return m_lexicalTable[srcTrgIdx + rank].second;
+}
+
+size_t PhraseDecoder::GetMaxSourcePhraseLength()
+{
+  return m_maxPhraseLength;
+}
+
+// The masks below are built from unsigned constants: 3 << 30, 255 << 24 and
+// 1 << 31 do not fit into a signed int.
+
+// Clears the two type bits.
+inline unsigned PhraseDecoder::DecodeREncSymbol1(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(3u << 30);
+}
+
+// Clears the highest eight bits (type and position), leaving the rank.
+inline unsigned PhraseDecoder::DecodeREncSymbol2Rank(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(255u << 24);
+}
+
+// Returns bits 24 to 29, the position field.
+inline unsigned PhraseDecoder::DecodeREncSymbol2Position(unsigned encodedSymbol)
+{
+  return (encodedSymbol & ~(3u << 30)) >> 24;
+}
+
+// Clears the two type bits.
+inline unsigned PhraseDecoder::DecodeREncSymbol3(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(3u << 30);
+}
+
+// Clears the type bit.
+inline unsigned PhraseDecoder::DecodePREncSymbol1(unsigned encodedSymbol)
+{
+  return encodedSymbol & ~(1u << 31);
+}
+
+// Bits 25 to 30 hold the left offset plus 32; the field is converted to int
+// before the subtraction so that the result is a plain signed value.
+inline int PhraseDecoder::DecodePREncSymbol2Left(unsigned encodedSymbol)
+{
+  return int((encodedSymbol >> 25) & 63) - 32;
+}
+
+// Bits 19 to 24 hold the right offset plus 32.
+inline int PhraseDecoder::DecodePREncSymbol2Right(unsigned encodedSymbol)
+{
+  return int((encodedSymbol >> 19) & 63) - 32;
+}
+
+// The lowest 19 bits hold the rank.
+inline unsigned PhraseDecoder::DecodePREncSymbol2Rank(unsigned encodedSymbol)
+{
+  return (encodedSymbol & 524287);
+}
+
+// Reads count items of the given size into dst; true only if all of them
+// were read. A count of zero succeeds without touching dst.
+static bool ReadItems(void* dst, size_t size, size_t count, std::FILE* in)
+{
+  return count == 0 || std::fread(dst, size, count, in) == count;
+}
+
+// Reads the decoder section from the current position of in: the header
+// fields, for REnc the source symbols and the lexical table, then the target
+// symbols and the Huffman trees. Returns the number of bytes consumed, or 0
+// as soon as a read comes up short. The return values of fread used to be
+// ignored, so a truncated file left the decoder half initialised.
+size_t PhraseDecoder::Load(std::FILE* in)
+{
+  size_t start = std::ftell(in);
+
+  if(!ReadItems(&m_coding, sizeof(m_coding), 1, in)
+     || !ReadItems(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in)
+     || !ReadItems(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in)
+     || !ReadItems(&m_maxRank, sizeof(m_maxRank), 1, in)
+     || !ReadItems(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in))
+    return 0;
+
+  if(m_coding == REnc)
+  {
+    m_sourceSymbols.load(in);
+
+    // Element 0 is only addressed when the table is non-empty; taking
+    // &v[0] of an empty vector is undefined.
+    size_t size;
+    if(!ReadItems(&size, sizeof(size_t), 1, in))
+      return 0;
+    m_lexicalTableIndex.resize(size);
+    if(size && !ReadItems(&m_lexicalTableIndex[0], sizeof(size_t), size, in))
+      return 0;
+
+    if(!ReadItems(&size, sizeof(size_t), 1, in))
+      return 0;
+    m_lexicalTable.resize(size);
+    if(size && !ReadItems(&m_lexicalTable[0], sizeof(SrcTrg), size, in))
+      return 0;
+  }
+
+  m_targetSymbols.load(in);
+
+  // The tree element types were stripped from this copy; they are restored
+  // from the types DecodeCollection assigns NextSymbol() results to.
+  m_symbolTree = new CanonicalHuffman<unsigned>(in);
+
+  if(!ReadItems(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in))
+    return 0;
+  if(m_multipleScoreTrees)
+  {
+    m_scoreTrees.resize(m_numScoreComponent);
+    for(size_t i = 0; i < m_numScoreComponent; i++)
+      m_scoreTrees[i] = new CanonicalHuffman<float>(in);
+  }
+  else
+  {
+    m_scoreTrees.resize(1);
+    m_scoreTrees[0] = new CanonicalHuffman<float>(in);
+  }
+
+  if(m_containsAlignmentInfo)
+    m_alignTree = new CanonicalHuffman<AlignPoint>(in);
+
+  size_t end = std::ftell(in);
+  return end - start;
+}
+
+// Appends the field separator to a source phrase string.
+std::string PhraseDecoder::MakeSourceKey(std::string &source)
+{
+  return source + m_separator;
+}
+
+TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
+{
+
+  // Not using TargetPhraseCollection avoiding "new" operator
+  // which can introduce heavy locking with multiple threads
+  TargetPhraseVectorPtr tpv(new TargetPhraseVector());
+  size_t bitsLeft = 0;
+
+  if(m_coding == PREnc)
+  {
+    std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
+      = m_decodingCache.Retrieve(sourcePhrase);
+
+    // Has been cached and is complete or does not need to be completed
+    if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
+      return cachedPhraseColl.first;
+
+    // Has been cached, but is incomplete
+    else if(cachedPhraseColl.first != NULL)
+    {
+      bitsLeft = cachedPhraseColl.second;
+      tpv->resize(cachedPhraseColl.first->size());
+      std::copy(cachedPhraseColl.first->begin(),
+                cachedPhraseColl.first->end(),
+
tpv->begin()); + } + } + + // Retrieve source phrase identifier + std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input); + size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)]; + + if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) + { + // Retrieve compressed and encoded target phrase collection + std::string encodedPhraseCollection; + if(m_phraseDictionary.m_inMemory) + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId]; + else + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId]; + + BitStream<> encodedBitStream(encodedPhraseCollection); + if(m_coding == PREnc && bitsLeft) + encodedBitStream.SetLeft(bitsLeft); + + // Decompress and decode target phrase collection + TargetPhraseVectorPtr decodedPhraseColl = + DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel); + + return decodedPhraseColl; + } + else + return TargetPhraseVectorPtr(); +} + +TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( + TargetPhraseVectorPtr tpv, BitStream<> &encodedBitStream, + const Phrase &sourcePhrase, bool topLevel) +{ + + bool extending = tpv->size(); + size_t bitsLeft = encodedBitStream.RemainingBits(); + + typedef std::pair AlignPointSizeT; + + std::vector sourceWords; + if(m_coding == REnc) + { + for(size_t i = 0; i < sourcePhrase.GetSize(); i++) + { + std::string sourceWord + = sourcePhrase.GetWord(i).GetString(*m_input, false); + unsigned idx = GetSourceSymbolId(sourceWord); + sourceWords.push_back(idx); + } + } + + unsigned phraseStopSymbol = 0; + AlignPoint alignStopSymbol(-1, -1); + + std::vector scores; + std::set alignment; + + enum DecodeState { New, Symbol, Score, Alignment, Add } state = New; + + size_t srcSize = sourcePhrase.GetSize(); + + TargetPhrase* targetPhrase = NULL; + while(encodedBitStream.RemainingBits()) + { + + if(state == New) + { + // Creating new TargetPhrase on the heap + tpv->push_back(TargetPhrase(Output)); + targetPhrase 
= &tpv->back(); + + targetPhrase->SetSourcePhrase(&sourcePhrase); + alignment.clear(); + scores.clear(); + + state = Symbol; + } + + if(state == Symbol) + { + unsigned symbol = m_symbolTree->NextSymbol(encodedBitStream); + + if(symbol == phraseStopSymbol) + { + state = Score; + } + else + { + if(m_coding == REnc) + { + std::string wordString; + size_t type = GetREncType(symbol); + + if(type == 1) + { + unsigned decodedSymbol = DecodeREncSymbol1(symbol); + wordString = GetTargetSymbol(decodedSymbol); + } + else if (type == 2) + { + size_t rank = DecodeREncSymbol2Rank(symbol); + size_t srcPos = DecodeREncSymbol2Position(symbol); + + if(srcPos >= sourceWords.size()) + return TargetPhraseVectorPtr(); + + wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank)); + if(StaticData::Instance().UseAlignmentInfo()) + { + size_t trgPos = targetPhrase->GetSize(); + alignment.insert(AlignPoint(srcPos, trgPos)); + } + } + else if(type == 3) + { + size_t rank = DecodeREncSymbol3(symbol); + size_t srcPos = targetPhrase->GetSize(); + + if(srcPos >= sourceWords.size()) + return TargetPhraseVectorPtr(); + + wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank)); + if(StaticData::Instance().UseAlignmentInfo()) + { + size_t trgPos = srcPos; + alignment.insert(AlignPoint(srcPos, trgPos)); + } + } + + Word word; + word.CreateFromString(Output, *m_output, wordString, false); + targetPhrase->AddWord(word); + } + else if(m_coding == PREnc) + { + // if the symbol is just a word + if(GetPREncType(symbol) == 1) + { + unsigned decodedSymbol = DecodePREncSymbol1(symbol); + Word word; + word.CreateFromString(Output, *m_output, + GetTargetSymbol(decodedSymbol), false); + targetPhrase->AddWord(word); + } + // if the symbol is a subphrase pointer + else + { + int left = DecodePREncSymbol2Left(symbol); + int right = DecodePREncSymbol2Right(symbol); + unsigned rank = DecodePREncSymbol2Rank(symbol); + + int srcStart = left + targetPhrase->GetSize(); + int srcEnd = srcSize 
- right - 1; + + // false positive consistency check + if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize) + return TargetPhraseVectorPtr(); + + // false positive consistency check + if(m_maxRank && rank > m_maxRank) + return TargetPhraseVectorPtr(); + + // set subphrase by default to itself + TargetPhraseVectorPtr subTpv = tpv; + + // if range smaller than source phrase retrieve subphrase + if(unsigned(srcEnd - srcStart + 1) != srcSize) + { + Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd)); + subTpv = CreateTargetPhraseCollection(subPhrase, false); + } + + // false positive consistency check + if(subTpv != NULL && rank < subTpv->size()) + { + // insert the subphrase into the main target phrase + TargetPhrase& subTp = subTpv->at(rank); + if(StaticData::Instance().UseAlignmentInfo()) + { + // reconstruct the alignment data based on the alignment of the subphrase + for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin(); + it != subTp.GetAlignmentInfo().end(); it++) + { + alignment.insert(AlignPointSizeT(srcStart + it->first, + targetPhrase->GetSize() + it->second)); + } + } + targetPhrase->Append(subTp); + } + else + return TargetPhraseVectorPtr(); + } + } + else + { + Word word; + word.CreateFromString(Output, *m_output, + GetTargetSymbol(symbol), false); + targetPhrase->AddWord(word); + } + } + } + else if(state == Score) + { + size_t idx = m_multipleScoreTrees ? 
scores.size() : 0; + float score = m_scoreTrees[idx]->NextSymbol(encodedBitStream); + scores.push_back(score); + + if(scores.size() == m_numScoreComponent) + { + targetPhrase->SetScore(m_feature, scores, *m_weight, m_weightWP, *m_languageModels); + + if(m_containsAlignmentInfo) + state = Alignment; + else + state = Add; + } + } + else if(state == Alignment) + { + AlignPoint alignPoint = m_alignTree->NextSymbol(encodedBitStream); + if(alignPoint == alignStopSymbol) + { + state = Add; + } + else + { + if(StaticData::Instance().UseAlignmentInfo()) + alignment.insert(AlignPointSizeT(alignPoint)); + } + } + + if(state == Add) + { + if(StaticData::Instance().UseAlignmentInfo()) + targetPhrase->SetAlignmentInfo(alignment); + + if(m_coding == PREnc) + { + if(!m_maxRank || tpv->size() <= m_maxRank) + bitsLeft = encodedBitStream.RemainingBits(); + + if(!topLevel && m_maxRank && tpv->size() >= m_maxRank) + break; + } + + if(encodedBitStream.RemainingBits() <= 8) + break; + + state = New; + } + } + + if(m_coding == PREnc && !extending) + { + bitsLeft = bitsLeft > 8 ? 
bitsLeft : 0; + m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank); + } + + return tpv; +} + +void PhraseDecoder::PruneCache() +{ + m_decodingCache.Prune(); +} + +} diff --git a/moses/src/CompactPT/PhraseDecoder.h b/moses/src/CompactPT/PhraseDecoder.h new file mode 100644 index 000000000..f02212431 --- /dev/null +++ b/moses/src/CompactPT/PhraseDecoder.h @@ -0,0 +1,133 @@ +#ifndef moses_PhraseDecoder_h +#define moses_PhraseDecoder_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "TypeDef.h" +#include "FactorCollection.h" +#include "Word.h" +#include "Util.h" +#include "InputFileStream.h" +#include "StaticData.h" +#include "WordsRange.h" +#include "UserMessage.h" + +#include "PhraseDictionaryCompact.h" +#include "StringVector.h" +#include "CanonicalHuffman.h" +#include "ConsistantPhrases.h" +#include "TargetPhraseCollectionCache.h" + +namespace Moses +{ + +class PhraseDictionaryCompact; + +class PhraseDecoder +{ + protected: + + friend class PhraseDictionaryCompact; + + typedef std::pair AlignPoint; + typedef std::pair SrcTrg; + + enum Coding { None, REnc, PREnc } m_coding; + + size_t m_numScoreComponent; + bool m_containsAlignmentInfo; + size_t m_maxRank; + size_t m_maxPhraseLength; + + boost::unordered_map m_sourceSymbolsMap; + StringVector m_sourceSymbols; + StringVector m_targetSymbols; + + std::vector m_lexicalTableIndex; + std::vector m_lexicalTable; + + CanonicalHuffman* m_symbolTree; + + bool m_multipleScoreTrees; + std::vector*> m_scoreTrees; + + CanonicalHuffman* m_alignTree; + + TargetPhraseCollectionCache m_decodingCache; + + PhraseDictionaryCompact& m_phraseDictionary; + + // *********************************************** + + const std::vector* m_input; + const std::vector* m_output; + const PhraseDictionaryFeature* m_feature; + const std::vector* m_weight; + float m_weightWP; + const LMList* m_languageModels; + + std::string m_separator; + + // 
*********************************************** + + unsigned GetSourceSymbolId(std::string& s); + std::string GetTargetSymbol(unsigned id) const; + + size_t GetREncType(unsigned encodedSymbol); + size_t GetPREncType(unsigned encodedSymbol); + + unsigned GetTranslation(unsigned srcIdx, size_t rank); + + size_t GetMaxSourcePhraseLength(); + + unsigned DecodeREncSymbol1(unsigned encodedSymbol); + unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol); + unsigned DecodeREncSymbol2Position(unsigned encodedSymbol); + unsigned DecodeREncSymbol3(unsigned encodedSymbol); + + unsigned DecodePREncSymbol1(unsigned encodedSymbol); + int DecodePREncSymbol2Left(unsigned encodedSymbol); + int DecodePREncSymbol2Right(unsigned encodedSymbol); + unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol); + + std::string MakeSourceKey(std::string &); + + public: + + PhraseDecoder( + PhraseDictionaryCompact &phraseDictionary, + const std::vector* &input, + const std::vector* &output, + const PhraseDictionaryFeature* feature, + size_t numScoreComponent, + const std::vector* weight, + float weightWP, + const LMList* languageModels + ); + + ~PhraseDecoder(); + + size_t Load(std::FILE* in); + + TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase, + bool topLevel = false); + + TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv, + BitStream<> &encodedBitStream, + const Phrase &sourcePhrase, + bool topLevel); + + void PruneCache(); +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/PhraseDictionaryCompact.cpp b/moses/src/CompactPT/PhraseDictionaryCompact.cpp new file mode 100644 index 000000000..8bd078d8c --- /dev/null +++ b/moses/src/CompactPT/PhraseDictionaryCompact.cpp @@ -0,0 +1,188 @@ +// $Id: PhraseDictionaryMemoryHashed.cpp 3908 2011-02-28 11:41:08Z pjwilliams $ +// vim:tabstop=2 + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 
University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+// NOTE(review): the six system header names were stripped from this copy of
+// the patch; the list below is presumed - confirm against upstream.
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <queue>
+#include <algorithm>
+#include <sys/stat.h>
+
+#include "PhraseDictionaryCompact.h"
+#include "FactorCollection.h"
+#include "Word.h"
+#include "Util.h"
+#include "InputFileStream.h"
+#include "StaticData.h"
+#include "WordsRange.h"
+#include "UserMessage.h"
+#include "ThreadPool.h"
+
+using namespace std;
+
+namespace Moses
+{
+
+// Opens the compact phrase table at filePath and loads its three sections in
+// file order: source phrase index, phrase decoder, target phrase collections.
+// With m_inMemory set the index and the collections are read into memory,
+// otherwise they are left on disk. Returns false if the file cannot be opened
+// or if any section reports a size of zero.
+//
+// NOTE(review): the vector element types were stripped from this copy;
+// FactorType and float are presumed - confirm against the header.
+bool PhraseDictionaryCompact::Load(const std::vector<FactorType> &input
+                                   , const std::vector<FactorType> &output
+                                   , const string &filePath
+                                   , const vector<float> &weight
+                                   , size_t tableLimit
+                                   , const LMList &languageModels
+                                   , float weightWP)
+{
+  m_input = &input;
+  m_output = &output;
+  m_weight = &weight;
+  m_tableLimit = tableLimit;
+  m_languageModels = &languageModels;
+  m_weightWP = weightWP;
+
+  std::string fullFilePath = filePath;
+
+  // The table is binary data, so it is opened with "rb": a text-mode stream
+  // may translate line endings on platforms that distinguish the two. The
+  // result is checked before use; a missing file used to hand a null stream
+  // to the loaders below.
+  // NOTE(review): the stream is never closed in this function; the on-disk
+  // index and the mapped collections may keep using it - confirm before
+  // adding an fclose.
+  std::FILE* pFile = std::fopen(fullFilePath.c_str(), "rb");
+  if(pFile == NULL)
+    return false;
+
+  m_phraseDecoder = new PhraseDecoder(*this, m_input, m_output, m_feature,
+                                      m_numScoreComponent, m_weight, m_weightWP,
+                                      m_languageModels);
+
+  size_t indexSize;
+  if(m_inMemory)
+    // Load source phrase index into memory
+    indexSize = m_hash.Load(pFile);
+  else
+    // Keep source phrase index on disk
+    indexSize = m_hash.LoadIndex(pFile);
+
+  size_t coderSize = m_phraseDecoder->Load(pFile);
+
+  size_t phraseSize;
+  if(m_inMemory)
+    // Load target phrase collections into memory
+    phraseSize = m_targetPhrasesMemory.load(pFile, false);
+  else
+    // Keep target phrase collections on disk
+    phraseSize = m_targetPhrasesMapped.load(pFile, true);
+
+  return indexSize && coderSize && phraseSize;
+}
+
+// Orders target phrases by descending future score.
+struct CompareTargetPhrase {
+  bool operator() (const TargetPhrase &a, const TargetPhrase &b) const {
+    return a.GetFutureScore() > b.GetFutureScore();
+  }
+};
+
+// Returns the target phrases for sourcePhrase, cut down to the m_tableLimit
+// best by future score when a limit is set, or NULL if the phrase is not in
+// the table. The collection is owned by the per-thread sentence cache and is
+// freed by CleanUp().
+const TargetPhraseCollection*
+PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
+
+  // There is no such source phrase if source phrase is longer than longest
+  // observed source phrase during compilation
+  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
+    return NULL;
+
+  // Hand back the collection already built for this phrase. The sentence
+  // cache keeps one collection per source phrase, so a second collection for
+  // the same phrase would be refused by CacheForCleanup() and never freed.
+  PhraseDictionaryCompact* self = const_cast<PhraseDictionaryCompact*>(this);
+  if(TargetPhraseCollection* cached = self->RetrieveFromCache(sourcePhrase))
+    return cached;
+
+  // Retrieve target phrase collection from phrase table
+  TargetPhraseVectorPtr decodedPhraseColl
+    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
+
+  if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
+    // Work on a copy: nth_element reorders the vector it is given
+    TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
+    TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
+
+    // Score phrases and if possible apply ttable_limit
+    TargetPhraseVector::iterator nth =
+      (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
+      tpv->end() : tpv->begin() + m_tableLimit;
+    std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
+    for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; ++it)
+      phraseColl->Add(new TargetPhrase(*it));
+
+    // Cache phrase pair for clean-up or retrieval with PREnc
+    self->CacheForCleanup(sourcePhrase, phraseColl);
+
+    return phraseColl;
+  }
+  else
+    return NULL;
+
+}
+
+PhraseDictionaryCompact::~PhraseDictionaryCompact() {
+  // deleting a null pointer is a no-op
+  delete m_phraseDecoder;
+}
+
+//TO_STRING_BODY(PhraseDictionaryCompact)
+
+// Returns the collection cached for sourcePhrase (per thread when built with
+// threads), or NULL if there is none.
+TargetPhraseCollection*
+PhraseDictionaryCompact::RetrieveFromCache(const Phrase &sourcePhrase) {
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_sentenceMutex);
+  PhraseCache &ref = m_sentenceCache[pthread_self()];
+#else
+  PhraseCache &ref = m_sentenceCache;
+#endif
+  PhraseCache::iterator it = ref.find(sourcePhrase);
+  if(it != ref.end())
+    return it->second;
+  else
+    return NULL;
+}
+
+// Registers tpc so that CleanUp() deletes it. An entry that already exists
+// for sourcePhrase is kept and tpc is not stored; callers have to check
+// RetrieveFromCache() first.
+void PhraseDictionaryCompact::CacheForCleanup(const Phrase &sourcePhrase,
+                                              TargetPhraseCollection* tpc) {
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_sentenceMutex);
+  m_sentenceCache[pthread_self()].insert(std::make_pair(sourcePhrase, tpc));
+#else
+  m_sentenceCache.insert(std::make_pair(sourcePhrase, tpc));
+#endif
+}
+
+void PhraseDictionaryCompact::InitializeForInput(const Moses::InputType&) {}
+
+void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
+                                             const TargetPhrase &targetPhrase) { }
+
+// Trims the on-disk index ranges (when not in memory), prunes the decoder
+// cache and deletes every collection cached by the calling thread.
+void PhraseDictionaryCompact::CleanUp() {
+  if(!m_inMemory)
+    m_hash.KeepNLastRanges(0.01, 0.2);
+
+  m_phraseDecoder->PruneCache();
+
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_sentenceMutex);
+  PhraseCache &ref = m_sentenceCache[pthread_self()];
+#else
+  PhraseCache &ref = m_sentenceCache;
+#endif
+
+  for(PhraseCache::iterator it = ref.begin(); it != ref.end(); ++it)
+    delete it->second;
+
+  // Swapping with an empty map empties the cache
+  PhraseCache temp;
+  temp.swap(ref);
+}
+
+}
+
+diff --git
a/moses/src/CompactPT/PhraseDictionaryCompact.h b/moses/src/CompactPT/PhraseDictionaryCompact.h new file mode 100644 index 000000000..926d2e823 --- /dev/null +++ b/moses/src/CompactPT/PhraseDictionaryCompact.h @@ -0,0 +1,119 @@ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_PhraseDictionaryCompact_h +#define moses_PhraseDictionaryCompact_h + +#include + +#ifdef WITH_THREADS +#ifdef BOOST_HAS_PTHREADS +#include +#endif +#endif + +#include "PhraseDictionary.h" +#include "ThreadPool.h" + +#include "BlockHashIndex.h" +#include "StringVector.h" +#include "PhraseDecoder.h" +#include "TargetPhraseCollectionCache.h" + +namespace Moses +{ + +class PhraseDecoder; + +class PhraseDictionaryCompact : public PhraseDictionary +{ +protected: + friend class PhraseDecoder; + + PhraseTableImplementation m_implementation; + bool m_inMemory; + + typedef std::map PhraseCache; +#ifdef WITH_THREADS + boost::mutex m_sentenceMutex; + typedef std::map SentenceCache; +#else + typedef PhraseCache SentenceCache; +#endif + SentenceCache m_sentenceCache; + + BlockHashIndex m_hash; + 
PhraseDecoder* m_phraseDecoder; + + StringVector m_targetPhrasesMapped; + StringVector m_targetPhrasesMemory; + + const std::vector* m_input; + const std::vector* m_output; + + const std::vector* m_weight; + const LMList* m_languageModels; + float m_weightWP; + +public: + PhraseDictionaryCompact(size_t numScoreComponent, + PhraseTableImplementation implementation, + PhraseDictionaryFeature* feature) + : PhraseDictionary(numScoreComponent, feature), + m_implementation(implementation), + m_inMemory(StaticData::Instance().UseMinphrInMemory()), + m_hash(10, 16), + m_phraseDecoder(0) + {} + + virtual ~PhraseDictionaryCompact(); + + bool Load(const std::vector &input + , const std::vector &output + , const std::string &filePath + , const std::vector &weight + , size_t tableLimit + , const LMList &languageModels + , float weightWP); + + const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const; + + void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase); + + void InitializeForInput(const Moses::InputType&); + + TargetPhraseCollection* RetrieveFromCache(const Phrase &sourcePhrase); + void CacheForCleanup(const Phrase &source, TargetPhraseCollection* tpc); + void CleanUp(); + + virtual ChartRuleLookupManager *CreateRuleLookupManager( + const InputType &, + const ChartCellCollection &) + { + assert(false); + return 0; + } + + TO_STRING(); + +}; + +} +#endif diff --git a/moses/src/CompactPT/PhraseTableCreator.cpp b/moses/src/CompactPT/PhraseTableCreator.cpp new file mode 100644 index 000000000..9ec333223 --- /dev/null +++ b/moses/src/CompactPT/PhraseTableCreator.cpp @@ -0,0 +1,1195 @@ +#include + +#include "PhraseTableCreator.h" + +namespace Moses +{ + +std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__"; +std::string PhraseTableCreator::m_separator = " ||| "; + +PhraseTableCreator::PhraseTableCreator(std::string inPath, + std::string outPath, + size_t numScoreComponent, + Coding coding, + size_t 
orderBits, + size_t fingerPrintBits, + bool useAlignmentInfo, + bool multipleScoreTrees, + size_t quantize, + size_t maxRank +#ifdef WITH_THREADS + , size_t threads +#endif + ) + : m_inPath(inPath), m_outPath(outPath), + m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent), + m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), + m_useAlignmentInfo(useAlignmentInfo), + m_multipleScoreTrees(multipleScoreTrees), + m_quantize(quantize), m_maxRank(maxRank), + #ifdef WITH_THREADS + m_threads(threads), + m_srcHash(m_orderBits, m_fingerPrintBits, 1), + m_rnkHash(10, 24, m_threads), + #else + m_srcHash(m_orderBits, m_fingerPrintBits), + m_rnkHash(m_orderBits, m_fingerPrintBits), + #endif + m_maxPhraseLength(0), + m_lastFlushedLine(-1), m_lastFlushedSourceNum(0), + m_lastFlushedSourcePhrase("") +{ + PrintInfo(); + + AddTargetSymbolId(m_phraseStopSymbol); + + size_t cur_pass = 1; + size_t all_passes = 2; + if(m_coding == PREnc) + all_passes = 3; + + m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); + for(std::vector::iterator it = m_scoreCounters.begin(); + it != m_scoreCounters.end(); it++) + *it = new ScoreCounter(); + m_scoreTrees.resize(m_multipleScoreTrees ? 
m_numScoreComponent : 1); + + // 0th pass + if(m_coding == REnc) + { + size_t found = inPath.find_last_of("/\\"); + std::string path = inPath.substr(0, found); + LoadLexicalTable(path + "/lex.f2e"); + } + else if(m_coding == PREnc) + { + std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl; + cur_pass++; + CreateRankHash(); + } + + // 1st pass + std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl; + m_srcHash.BeginSave(m_outFile); + EncodeTargetPhrases(); + + cur_pass++; + + std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; + CalcHuffmanCodes(); + + // 2nd pass + std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl; + CompressTargetPhrases(); + + std::cerr << "Saving to " << m_outPath << std::endl; + Save(); + std::cerr << "Done" << std::endl; + std::fclose(m_outFile); +} + +void PhraseTableCreator::PrintInfo() +{ + std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"}; + + std::cerr << "Used options:" << std::endl; + std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl; + std::cerr << "\tOuput phrase table will be written to: " << m_outPath << std::endl; + std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl; + std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl; + std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl; + if(m_coding == PREnc) + { + std::cerr << "\tMaxiumum allowed rank for PREnc: "; + if(!m_maxRank) + std::cerr << "unlimited" << std::endl; + else + std::cerr << m_maxRank << std::endl; + } + std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl; + std::cerr << 
"\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl; + std::cerr << "\tUsing score quantization: "; + if(m_quantize) + std::cerr << m_quantize << " best" << std::endl; + else + std::cerr << "no" << std::endl; + std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl; + +#ifdef WITH_THREADS + std::cerr << "\tRunning with " << m_threads << " threads" << std::endl; +#endif + std::cerr << std::endl; +} + +void PhraseTableCreator::Save() +{ + // Save type of encoding + std::fwrite(&m_coding, sizeof(m_coding), 1, m_outFile); + std::fwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile); + std::fwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile); + std::fwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile); + std::fwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile); + + if(m_coding == REnc) + { + // Save source language symbols for REnc + std::vector temp1; + temp1.resize(m_sourceSymbolsMap.size()); + for(boost::unordered_map::iterator it + = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) + temp1[it->second] = it->first; + std::sort(temp1.begin(), temp1.end()); + StringVector sourceSymbols; + for(std::vector::iterator it = temp1.begin(); + it != temp1.end(); it++) + sourceSymbols.push_back(*it); + sourceSymbols.save(m_outFile); + + // Save lexical translation table for REnc + size_t size = m_lexicalTableIndex.size(); + std::fwrite(&size, sizeof(size_t), 1, m_outFile); + std::fwrite(&m_lexicalTableIndex[0], sizeof(size_t), size, m_outFile); + size = m_lexicalTable.size(); + std::fwrite(&size, sizeof(size_t), 1, m_outFile); + std::fwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile); + } + + // Save target language symbols + std::vector temp2; + temp2.resize(m_targetSymbolsMap.size()); + for(boost::unordered_map::iterator it + = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); 
it++) + temp2[it->second] = it->first; + StringVector targetSymbols; + for(std::vector::iterator it = temp2.begin(); + it != temp2.end(); it++) + targetSymbols.push_back(*it); + targetSymbols.save(m_outFile); + + // Save Huffman codes for target language symbols + m_symbolTree->Save(m_outFile); + + // Save number of Huffman code sets for scores and + // save Huffman code sets + std::fwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile); + size_t numScoreTrees = m_scoreTrees.size(); + for(size_t i = 0; i < numScoreTrees; i++) + m_scoreTrees[i]->Save(m_outFile); + + // Save Huffman codes for alignments + if(m_useAlignmentInfo) + m_alignTree->Save(m_outFile); + + // Save compressed target phrase collections + m_compressedTargetPhrases.save(m_outFile); +} + +void PhraseTableCreator::LoadLexicalTable(std::string filePath) +{ + std::vector t_lexTable; + + std::cerr << "Reading in lexical table for Rank Encoding" << std::endl; + std::ifstream lexIn(filePath.c_str(), std::ifstream::in); + std::string src, trg; + float prob; + + // Reading in the translation probability lexicon + + std::cerr << "\tLoading from " << filePath << std::endl; + while(lexIn >> trg >> src >> prob) + { + t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob)); + AddSourceSymbolId(src); + AddTargetSymbolId(trg); + } + + // Sorting lexicon by source words by lexicographical order, corresponding + // target words by decreasing probability. 
+ + std::cerr << "\tSorting according to translation rank" << std::endl; + std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter()); + + // Re-assigning source word ids in lexicographical order + + std::vector temp1; + temp1.resize(m_sourceSymbolsMap.size()); + for(boost::unordered_map::iterator it + = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) + temp1[it->second] = it->first; + + std::sort(temp1.begin(), temp1.end()); + + for(size_t i = 0; i < temp1.size(); i++) + m_sourceSymbolsMap[temp1[i]] = i; + + // Building the lexicon based on source and target word ids + + std::string srcWord = ""; + size_t srcIdx = 0; + for(std::vector::iterator it = t_lexTable.begin(); + it != t_lexTable.end(); it++) + { + + // If we encounter a new source word + if(it->first.first != srcWord) + { + srcIdx = GetSourceSymbolId(it->first.first); + + // Store position of first translation + if(srcIdx >= m_lexicalTableIndex.size()) + m_lexicalTableIndex.resize(srcIdx + 1); + m_lexicalTableIndex[srcIdx] = m_lexicalTable.size(); + } + + // Store pair of source word and target word + size_t trgIdx = GetTargetSymbolId(it->first.second); + m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx)); + + srcWord = it->first.first; + } + std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl; + std::cerr << std::endl; +} + +void PhraseTableCreator::CreateRankHash() +{ + InputFileStream inFile(m_inPath); + +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) + { + RankingTask* rt = new RankingTask(inFile, *this); + threads.create_thread(*rt); + } + threads.join_all(); +#else + RankingTask* rt = new RankingTask(inFile, *this); + (*rt)(); + delete rt; +#endif + FlushRankedQueue(true); +} + +inline std::string PhraseTableCreator::MakeSourceKey(std::string &source) +{ + return source + m_separator; +} + +inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target) +{ + 
return source + m_separator + target + m_separator; +} + +void PhraseTableCreator::EncodeTargetPhrases() +{ + InputFileStream inFile(m_inPath); + +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) + { + EncodingTask* et = new EncodingTask(inFile, *this); + threads.create_thread(*et); + } + threads.join_all(); +#else + EncodingTask* et = new EncodingTask(inFile, *this); + (*et)(); + delete et; +#endif + FlushEncodedQueue(true); +} + + +void PhraseTableCreator::CompressTargetPhrases() +{ +#ifdef WITH_THREADS + boost::thread_group threads; + for (size_t i = 0; i < m_threads; ++i) { + CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this); + threads.create_thread(*ct); + } + threads.join_all(); +#else + CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this); + (*ct)(); + delete ct; +#endif + FlushCompressedQueue(true); +} + +void PhraseTableCreator::CalcHuffmanCodes() +{ + std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size() + << " target phrase symbols" << std::endl; + + m_symbolTree = new SymbolTree(m_symbolCounter.Begin(), + m_symbolCounter.End()); + + std::vector::iterator treeIt = m_scoreTrees.begin(); + for(std::vector::iterator it = m_scoreCounters.begin(); + it != m_scoreCounters.end(); it++) + { + + if(m_quantize) + (*it)->Quantize(m_quantize); + + std::cerr << "\tCreating Huffman codes for " << (*it)->Size() + << " scores" << std::endl; + + *treeIt = new ScoreTree((*it)->Begin(), (*it)->End()); + treeIt++; + } + + if(m_useAlignmentInfo) + { + std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size() + << " alignment points" << std::endl; + m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End()); + } + std::cerr << std::endl; +} + + +void PhraseTableCreator::AddSourceSymbolId(std::string& symbol) +{ + if(m_sourceSymbolsMap.count(symbol) == 0) { + unsigned value = m_sourceSymbolsMap.size(); + m_sourceSymbolsMap[symbol] = value; + } +} + 
+void PhraseTableCreator::AddTargetSymbolId(std::string& symbol) +{ + if(m_targetSymbolsMap.count(symbol) == 0) { + unsigned value = m_targetSymbolsMap.size(); + m_targetSymbolsMap[symbol] = value; + } +} + +unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol) +{ + boost::unordered_map::iterator it + = m_sourceSymbolsMap.find(symbol); + + if(it != m_sourceSymbolsMap.end()) + return it->second; + else + return m_sourceSymbolsMap.size(); +} + +unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol) +{ + boost::unordered_map::iterator it + = m_targetSymbolsMap.find(symbol); + + if(it != m_targetSymbolsMap.end()) + return it->second; + else + return m_targetSymbolsMap.size(); +} + +unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol) +{ +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + boost::unordered_map::iterator it + = m_targetSymbolsMap.find(symbol); + + if(it != m_targetSymbolsMap.end()) + return it->second; + else + { + unsigned value = m_targetSymbolsMap.size(); + m_targetSymbolsMap[symbol] = value; + return value; + } +} + +unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx) +{ + size_t srcTrgIdx = m_lexicalTableIndex[srcIdx]; + while(srcTrgIdx < m_lexicalTable.size() + && srcIdx == m_lexicalTable[srcTrgIdx].first + && m_lexicalTable[srcTrgIdx].second != trgIdx) + srcTrgIdx++; + + if(srcTrgIdx < m_lexicalTable.size() + && m_lexicalTable[srcTrgIdx].second == trgIdx) + return srcTrgIdx - m_lexicalTableIndex[srcIdx]; + else + return m_lexicalTable.size(); +} + +unsigned PhraseTableCreator::EncodeREncSymbol1(unsigned trgIdx) +{ + assert((~(1 << 31)) > trgIdx); + return trgIdx; +} + +unsigned PhraseTableCreator::EncodeREncSymbol2(unsigned pos, unsigned rank) +{ + unsigned symbol = rank; + symbol |= 1 << 30; + symbol |= pos << 24; + return symbol; +} + +unsigned PhraseTableCreator::EncodeREncSymbol3(unsigned rank) +{ + unsigned symbol = rank; + symbol |= 2 << 30; + return 
symbol; +} + +unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx) +{ + assert((~(1 << 31)) > trgIdx); + return trgIdx; +} + +unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank) +{ + // "left" and "right" must be smaller than 2^5 + // "rank" must be smaller than 2^19 + left = left + 32; + right = right + 32; + + assert(64 > left); + assert(64 > right); + assert(524288 > rank); + + unsigned symbol = 0; + symbol |= 1 << 31; + symbol |= left << 25; + symbol |= right << 19; + symbol |= rank; + return symbol; +} + +void PhraseTableCreator::EncodeTargetPhraseNone(std::vector& t, + std::ostream& os) +{ + std::stringstream encodedTargetPhrase; + size_t j = 0; + while(j < t.size()) + { + unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); + + m_symbolCounter.Increase(targetSymbolId); + os.write((char*)&targetSymbolId, sizeof(targetSymbolId)); + j++; + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + os.write((char*)&stopSymbolId, sizeof(stopSymbolId)); + m_symbolCounter.Increase(stopSymbolId); +} + +void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector& s, + std::vector& t, + std::set& a, + std::ostream& os) +{ + + std::stringstream encodedTargetPhrase; + + std::vector > a2(t.size()); + for(std::set::iterator it = a.begin(); it != a.end(); it++) + a2[it->second].push_back(it->first); + + for(size_t i = 0; i < t.size(); i++) + { + unsigned idxTarget = GetOrAddTargetSymbolId(t[i]); + unsigned encodedSymbol = -1; + + unsigned bestSrcPos = s.size(); + unsigned bestDiff = s.size(); + unsigned bestRank = m_lexicalTable.size(); + unsigned badRank = m_lexicalTable.size(); + + for(std::vector::iterator it = a2[i].begin(); it != a2[i].end(); it++) + { + unsigned idxSource = GetSourceSymbolId(s[*it]); + size_t r = GetRank(idxSource, idxTarget); + if(r != badRank) + { + if(r < bestRank) + { + bestRank = r; + bestSrcPos = *it; + bestDiff = abs(*it-i); + } + else if(r == bestRank && unsigned(abs(*it-i)) < 
bestDiff) + { + bestSrcPos = *it; + bestDiff = abs(*it-i); + } + } + } + + if(bestRank != badRank && bestSrcPos < s.size()) + { + if(bestSrcPos == i) + encodedSymbol = EncodeREncSymbol3(bestRank); + else + encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); + a.erase(AlignPoint(bestSrcPos, i)); + } + else + { + encodedSymbol = EncodeREncSymbol1(idxTarget); + } + + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); +} + +void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector& s, + std::vector& t, + std::set& a, + size_t ownRank, + std::ostream& os) +{ + std::vector encodedSymbols(t.size()); + std::vector encodedSymbolsLengths(t.size(), 0); + + ConsistantPhrases cp(s.size(), t.size(), a.begin(), a.end()); + while(cp.Size()) { + ConsistantPhrases::Phrase p = cp.Pop(); + + std::stringstream key1; + key1 << s[p.i]; + for(int i = p.i+1; i < p.i+p.m; i++) + key1 << " " << s[i]; + + std::stringstream key2; + key2 << t[p.j]; + for(int i = p.j+1; i < p.j+p.n; i++) + key2 << " " << t[i]; + + int rank = -1; + std::string key1Str = key1.str(), key2Str = key2.str(); + size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)]; + if(idx != m_rnkHash.GetSize()) + rank = m_ranks[idx]; + + if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) + { + if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) + { + std::stringstream encodedSymbol; + encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank); + encodedSymbolsLengths[p.j] = p.n; + + std::set tAlignment; + for(std::set::iterator it = a.begin(); + it != a.end(); it++) + if(it->first < p.i || it->first >= p.i + p.m + || it->second < p.j || it->second >= p.j + p.n) + tAlignment.insert(*it); + a = tAlignment; + 
cp.RemoveOverlap(p); + } + } + } + + std::stringstream encodedTargetPhrase; + + size_t j = 0; + while(j < t.size()) + { + if(encodedSymbolsLengths[j] > 0) + { + unsigned encodedSymbol = encodedSymbols[j]; + m_symbolCounter.Increase(encodedSymbol); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + j += encodedSymbolsLengths[j]; + } + else + { + unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]); + unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId); + m_symbolCounter.Increase(encodedSymbol); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + j++; + } + } + + unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId); + os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); + m_symbolCounter.Increase(encodedSymbol); +} + +void PhraseTableCreator::EncodeScores(std::vector& scores, std::ostream& os) +{ + size_t c = 0; + float score; + + while(c < scores.size()) + { + score = scores[c]; + score = FloorScore(TransformScore(score)); + os.write((char*)&score, sizeof(score)); + m_scoreCounters[m_multipleScoreTrees ? 
c : 0]->Increase(score); + c++; + } +} + +void PhraseTableCreator::EncodeAlignment(std::set& alignment, + std::ostream& os) +{ + for(std::set::iterator it = alignment.begin(); + it != alignment.end(); it++) + { + os.write((char*)&(*it), sizeof(AlignPoint)); + m_alignCounter.Increase(*it); + } + AlignPoint stop(-1, -1); + os.write((char*) &stop, sizeof(AlignPoint)); + m_alignCounter.Increase(stop); +} + +std::string PhraseTableCreator::EncodeLine(std::vector& tokens, size_t ownRank) +{ + std::string sourcePhraseStr = tokens[0]; + std::string targetPhraseStr = tokens[1]; + std::string scoresStr = tokens[2]; + std::string alignmentStr = tokens[3]; + + std::vector s = Tokenize(sourcePhraseStr); + + size_t phraseLength = s.size(); + if(m_maxPhraseLength < phraseLength) + m_maxPhraseLength = phraseLength; + + std::vector t = Tokenize(targetPhraseStr); + std::vector scores = Tokenize(scoresStr); + + std::set a; + if(m_coding != None || m_useAlignmentInfo) + { + std::vector positions = Tokenize(alignmentStr, " \t-"); + for(size_t i = 0; i < positions.size(); i += 2) + { + a.insert(AlignPoint(positions[i], positions[i+1])); + } + } + + std::stringstream encodedTargetPhrase; + + if(m_coding == PREnc) + { + EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase); + } + else if(m_coding == REnc) + { + EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); + } + else + { + EncodeTargetPhraseNone(t, encodedTargetPhrase); + } + + EncodeScores(scores, encodedTargetPhrase); + + if(m_useAlignmentInfo) + EncodeAlignment(a, encodedTargetPhrase); + + return encodedTargetPhrase.str(); +} + +std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection) +{ + enum EncodeState { + ReadSymbol, ReadScore, ReadAlignment, + EncodeSymbol, EncodeScore, EncodeAlignment }; + EncodeState state = ReadSymbol; + + unsigned phraseStopSymbolId; + if(m_coding == REnc) + phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); + else if(m_coding == 
PREnc) + phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol)); + else + phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); + AlignPoint alignStopSymbol(-1, -1); + + std::stringstream encodedStream(encodedCollection); + encodedStream.unsetf(std::ios::skipws); + + std::string output; + BitStream<> bitstream(output); + + unsigned symbol; + float score; + size_t currScore = 0; + AlignPoint alignPoint; + + while(encodedStream) + { + switch(state) + { + case ReadSymbol: + encodedStream.read((char*) &symbol, sizeof(unsigned)); + state = EncodeSymbol; + break; + case ReadScore: + if(currScore == m_numScoreComponent) + { + currScore = 0; + if(m_useAlignmentInfo) + state = ReadAlignment; + else + state = ReadSymbol; + } + else + { + encodedStream.read((char*) &score, sizeof(float)); + currScore++; + state = EncodeScore; + } + break; + case ReadAlignment: + encodedStream.read((char*) &alignPoint, sizeof(AlignPoint)); + state = EncodeAlignment; + break; + + case EncodeSymbol: + state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol; + bitstream.PutCode(m_symbolTree->Encode(symbol)); + break; + case EncodeScore: + { + state = ReadScore; + size_t idx = m_multipleScoreTrees ? currScore-1 : 0; + if(m_quantize) + score = m_scoreCounters[idx]->LowerBound(score); + bitstream.PutCode(m_scoreTrees[idx]->Encode(score)); + } + break; + case EncodeAlignment: + state = (alignPoint == alignStopSymbol) ? 
ReadSymbol : ReadAlignment; + bitstream.PutCode(m_alignTree->Encode(alignPoint)); + break; + } + } + + return output; +} + +void PhraseTableCreator::AddRankedLine(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushRankedQueue(bool force) +{ + size_t step = 1ul << 10; + + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) + { + m_lastFlushedLine++; + + PackedItem pi = m_queue.top(); + m_queue.pop(); + + if(m_lastSourceRange.size() == step) + { + m_rnkHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + } + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) + { + if(m_rankQueue.size()) { + + m_lastFlushedSourceNum++; + if(m_lastFlushedSourceNum % 100000 == 0) + std::cerr << "."; + if(m_lastFlushedSourceNum % 5000000 == 0) + { + std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; + } + + m_ranks.resize(m_lastFlushedLine + 1); + int r = 0; + while(!m_rankQueue.empty()) { + m_ranks[m_rankQueue.top().second] = r++; + m_rankQueue.pop(); + } + } + } + + m_lastSourceRange.push_back(pi.GetTrg()); + + m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine())); + m_lastFlushedSourcePhrase = pi.GetSrc(); + } + + if(force) + { + m_rnkHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + +#ifdef WITH_THREADS + m_rnkHash.WaitAll(); +#endif + + m_ranks.resize(m_lastFlushedLine + 1); + int r = 0; + while(!m_rankQueue.empty()) + { + m_ranks[m_rankQueue.top().second] = r++; + m_rankQueue.pop(); + } + + m_lastFlushedLine = -1; + m_lastFlushedSourceNum = 0; + + std::cerr << std::endl << std::endl; + } +} + + +void PhraseTableCreator::AddEncodedLine(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushEncodedQueue(bool force) +{ + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) + { + PackedItem pi = m_queue.top(); + m_queue.pop(); + m_lastFlushedLine++; + + if(m_lastFlushedSourcePhrase != pi.GetSrc()) + { + if(m_lastCollection.size()) + { + 
std::stringstream targetPhraseCollection; + for(std::vector::iterator it = + m_lastCollection.begin(); it != m_lastCollection.end(); it++) + targetPhraseCollection << *it; + + m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase)); + m_encodedTargetPhrases.push_back(targetPhraseCollection.str()); + + m_lastFlushedSourceNum++; + if(m_lastFlushedSourceNum % 100000 == 0) + std::cerr << "."; + if(m_lastFlushedSourceNum % 5000000 == 0) + std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl; + + m_lastCollection.clear(); + } + } + + if(m_lastSourceRange.size() == (1ul << m_orderBits)) + { + m_srcHash.AddRange(m_lastSourceRange); + m_srcHash.SaveLastRange(); + m_srcHash.DropLastRange(); + m_lastSourceRange.clear(); + } + + m_lastFlushedSourcePhrase = pi.GetSrc(); + if(m_coding == PREnc) + { + if(m_lastCollection.size() <= pi.GetRank()) + m_lastCollection.resize(pi.GetRank() + 1); + m_lastCollection[pi.GetRank()] = pi.GetTrg(); + } + else + { + m_lastCollection.push_back(pi.GetTrg()); + } + } + + if(force) + { + if(m_lastCollection.size()) + { + std::stringstream targetPhraseCollection; + for(std::vector::iterator it = + m_lastCollection.begin(); it != m_lastCollection.end(); it++) + targetPhraseCollection << *it; + + m_encodedTargetPhrases.push_back(targetPhraseCollection.str()); + + m_lastCollection.clear(); + } + + m_srcHash.AddRange(m_lastSourceRange); + m_lastSourceRange.clear(); + +#ifdef WITH_THREADS + m_srcHash.WaitAll(); +#endif + + m_srcHash.SaveLastRange(); + m_srcHash.DropLastRange(); + m_srcHash.FinalizeSave(); + + m_lastFlushedLine = -1; + m_lastFlushedSourceNum = 0; + + std::cerr << std::endl << std::endl; + } +} + +void PhraseTableCreator::AddCompressedCollection(PackedItem& pi) +{ + m_queue.push(pi); +} + +void PhraseTableCreator::FlushCompressedQueue(bool force) +{ + if(force || m_queue.size() > 10000) + { + while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) + { + PackedItem pi = m_queue.top(); + 
m_queue.pop(); + m_lastFlushedLine++; + + m_compressedTargetPhrases.push_back(pi.GetTrg()); + + if((pi.GetLine()+1) % 100000 == 0) + std::cerr << "."; + if((pi.GetLine()+1) % 5000000 == 0) + std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl; + + } + } + + if(force) + { + m_lastFlushedLine = -1; + std::cerr << std::endl << std::endl; + } +} + +//****************************************************************************// + +size_t RankingTask::m_lineNum = 0; +#ifdef WITH_THREADS +boost::mutex RankingTask::m_mutex; +boost::mutex RankingTask::m_fileMutex; +#endif + +RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator) + : m_inFile(inFile), m_creator(creator) {} + +void RankingTask::operator()() +{ + size_t lineNum = 0; + + std::vector lines; + size_t max_lines = 1000; + lines.reserve(max_lines); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } + + std::vector result; + result.reserve(max_lines); + + while(lines.size()) + { + for(size_t i = 0; i < lines.size(); i++) + { + std::vector tokens; + Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + + std::vector scores = Tokenize(tokens[2]); + float sortScore = scores[2]; + + std::string key1 = m_creator.MakeSourceKey(tokens[0]); + std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]); + + PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore); + result.push_back(packedItem); + } + lines.clear(); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + for(size_t i = 0; i < result.size(); i++) + m_creator.AddRankedLine(result[i]); + m_creator.FlushRankedQueue(); + } + + result.clear(); + lines.reserve(max_lines); + result.reserve(max_lines); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + 
std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } +} + +size_t EncodingTask::m_lineNum = 0; +#ifdef WITH_THREADS +boost::mutex EncodingTask::m_mutex; +boost::mutex EncodingTask::m_fileMutex; +#endif + +EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator) + : m_inFile(inFile), m_creator(creator) {} + +void EncodingTask::operator()() +{ + size_t lineNum = 0; + + std::vector lines; + size_t max_lines = 1000; + lines.reserve(max_lines); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } + + std::vector result; + result.reserve(max_lines); + + while(lines.size()) + { + for(size_t i = 0; i < lines.size(); i++) + { + std::vector tokens; + Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator); + + size_t ownRank = 0; + if(m_creator.m_coding == PhraseTableCreator::PREnc) + ownRank = m_creator.m_ranks[lineNum + i]; + + std::string encodedLine = m_creator.EncodeLine(tokens, ownRank); + + PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank); + result.push_back(packedItem); + } + lines.clear(); + + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + for(size_t i = 0; i < result.size(); i++) + m_creator.AddEncodedLine(result[i]); + m_creator.FlushEncodedQueue(); + } + + result.clear(); + lines.reserve(max_lines); + result.reserve(max_lines); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_fileMutex); +#endif + std::string line; + while(lines.size() < max_lines && std::getline(m_inFile, line)) + lines.push_back(line); + lineNum = m_lineNum; + m_lineNum += lines.size(); + } +} + +//****************************************************************************// + +size_t 
CompressionTask::m_collectionNum = 0; +#ifdef WITH_THREADS +boost::mutex CompressionTask::m_mutex; +#endif + +CompressionTask::CompressionTask(StringVector& encodedCollections, + PhraseTableCreator& creator) + : m_encodedCollections(encodedCollections), m_creator(creator) {} + +void CompressionTask::operator()() +{ + size_t collectionNum; + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + collectionNum = m_collectionNum; + m_collectionNum++; + } + + while(collectionNum < m_encodedCollections.size()) + { + std::string collection = m_encodedCollections[collectionNum]; + std::string compressedCollection + = m_creator.CompressEncodedCollection(collection); + + std::string dummy; + PackedItem packedItem(collectionNum, dummy, compressedCollection, 0); + +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_creator.AddCompressedCollection(packedItem); + m_creator.FlushCompressedQueue(); + + collectionNum = m_collectionNum; + m_collectionNum++; + } +} + +//****************************************************************************// + +PackedItem::PackedItem(long line, std::string sourcePhrase, + std::string packedTargetPhrase, size_t rank, + float score) + : m_line(line), m_sourcePhrase(sourcePhrase), + m_packedTargetPhrase(packedTargetPhrase), m_rank(rank), + m_score(score) {} + +long PackedItem::GetLine() const { return m_line; } + +const std::string& PackedItem::GetSrc() const { return m_sourcePhrase; } + +const std::string& PackedItem::GetTrg() const { return m_packedTargetPhrase; } + +size_t PackedItem::GetRank() const { return m_rank; } + +float PackedItem::GetScore() const { return m_score; } + +} diff --git a/moses/src/CompactPT/PhraseTableCreator.h b/moses/src/CompactPT/PhraseTableCreator.h new file mode 100644 index 000000000..77cfa85d4 --- /dev/null +++ b/moses/src/CompactPT/PhraseTableCreator.h @@ -0,0 +1,401 @@ +#ifndef moses_PhraseTableCreator_h +#define moses_PhraseTableCreator_h + +#include +#include 
+#include +#include +#include + +#include "InputFileStream.h" +#include "ThreadPool.h" +#include "UserMessage.h" +#include "Util.h" + +#include "CompactPT/BlockHashIndex.h" +#include "CompactPT/ConsistantPhrases.h" +#include "CompactPT/StringVector.h" +#include "CompactPT/CanonicalHuffman.h" + +namespace Moses +{ + +typedef std::pair AlignPoint; + +template +class Counter +{ + public: + typedef boost::unordered_map FreqMap; + typedef typename FreqMap::iterator iterator; + typedef typename FreqMap::mapped_type mapped_type; + typedef typename FreqMap::value_type value_type; + + private: +#ifdef WITH_THREADS + boost::mutex m_mutex; +#endif + FreqMap m_freqMap; + size_t m_maxSize; + std::vector m_bestVec; + + struct FreqSorter + { + bool operator()(const value_type& a, const value_type& b) const + { + if(a.second > b.second) + return true; + // Check impact on translation quality! + if(a.second == b.second && a.first > b.first) + return true; + return false; + } + }; + + public: + Counter() : m_maxSize(0) {} + + iterator Begin() + { + return m_freqMap.begin(); + } + + iterator End() + { + return m_freqMap.end(); + } + + void Increase(DataType data) + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_freqMap[data]++; + } + + void IncreaseBy(DataType data, size_t num) + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_freqMap[data] += num; + } + + mapped_type& operator[](DataType data) + { + return m_freqMap[data]; + } + + size_t Size() + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + return m_freqMap.size(); + } + + void Quantize(size_t maxSize) + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_maxSize = maxSize; + std::vector > freqVec; + freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end()); + std::sort(freqVec.begin(), freqVec.end(), FreqSorter()); + + for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++) + 
m_bestVec.push_back(freqVec[i].first); + + std::sort(m_bestVec.begin(), m_bestVec.end()); + + FreqMap t_freqMap; + for(typename std::vector >::iterator it + = freqVec.begin(); it != freqVec.end(); it++) + { + DataType closest = LowerBound(it->first); + t_freqMap[closest] += it->second; + } + + m_freqMap.swap(t_freqMap); + } + + void Clear() + { +#ifdef WITH_THREADS + boost::mutex::scoped_lock lock(m_mutex); +#endif + m_freqMap.clear(); + } + + DataType LowerBound(DataType data) + { + if(m_maxSize == 0 || m_bestVec.size() == 0) + return data; + else + { + typename std::vector::iterator it + = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data); + if(it != m_bestVec.end()) + return *it; + else + return m_bestVec.back(); + } + } +}; + +class PackedItem +{ + private: + long m_line; + std::string m_sourcePhrase; + std::string m_packedTargetPhrase; + size_t m_rank; + float m_score; + + public: + PackedItem(long line, std::string sourcePhrase, + std::string packedTargetPhrase, size_t rank, + float m_score = 0); + + long GetLine() const; + const std::string& GetSrc() const; + const std::string& GetTrg() const; + size_t GetRank() const; + float GetScore() const; +}; + +static bool operator<(const PackedItem &pi1, const PackedItem &pi2) { + if(pi1.GetLine() < pi2.GetLine()) + return false; + return true; +} + +class PhraseTableCreator +{ + public: + enum Coding { None, REnc, PREnc }; + + private: + std::string m_inPath; + std::string m_outPath; + + std::FILE* m_outFile; + + size_t m_numScoreComponent; + Coding m_coding; + size_t m_orderBits; + size_t m_fingerPrintBits; + bool m_useAlignmentInfo; + bool m_multipleScoreTrees; + size_t m_quantize; + size_t m_maxRank; + + static std::string m_phraseStopSymbol; + static std::string m_separator; + +#ifdef WITH_THREADS + size_t m_threads; + boost::mutex m_mutex; +#endif + + BlockHashIndex m_srcHash; + BlockHashIndex m_rnkHash; + + size_t m_maxPhraseLength; + + std::vector m_ranks; + + typedef std::pair SrcTrg; + typedef 
std::pair SrcTrgString; + typedef std::pair SrcTrgProb; + + struct SrcTrgProbSorter + { + bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const + { + if(a.first.first < b.first.first) + return true; + + if(a.first.first == b.first.first && a.second > b.second) + return true; + + if(a.first.first == b.first.first + && a.second == b.second + && a.first.second < b.first.second) + return true; + + return false; + } + }; + + std::vector m_lexicalTableIndex; + std::vector m_lexicalTable; + + StringVector + m_encodedTargetPhrases; + + StringVector + m_compressedTargetPhrases; + + boost::unordered_map m_targetSymbolsMap; + boost::unordered_map m_sourceSymbolsMap; + + typedef Counter SymbolCounter; + typedef Counter ScoreCounter; + typedef Counter AlignCounter; + + typedef CanonicalHuffman SymbolTree; + typedef CanonicalHuffman ScoreTree; + typedef CanonicalHuffman AlignTree; + + SymbolCounter m_symbolCounter; + SymbolTree* m_symbolTree; + + AlignCounter m_alignCounter; + AlignTree* m_alignTree; + + std::vector m_scoreCounters; + std::vector m_scoreTrees; + + std::priority_queue m_queue; + long m_lastFlushedLine; + long m_lastFlushedSourceNum; + std::string m_lastFlushedSourcePhrase; + std::vector m_lastSourceRange; + std::priority_queue > m_rankQueue; + std::vector m_lastCollection; + + void Save(); + void PrintInfo(); + + void AddSourceSymbolId(std::string& symbol); + unsigned GetSourceSymbolId(std::string& symbol); + + void AddTargetSymbolId(std::string& symbol); + unsigned GetTargetSymbolId(std::string& symbol); + unsigned GetOrAddTargetSymbolId(std::string& symbol); + + unsigned GetRank(unsigned srcIdx, unsigned trgIdx); + + unsigned EncodeREncSymbol1(unsigned symbol); + unsigned EncodeREncSymbol2(unsigned position, unsigned rank); + unsigned EncodeREncSymbol3(unsigned rank); + + unsigned EncodePREncSymbol1(unsigned symbol); + unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank); + + void EncodeTargetPhraseNone(std::vector& t, + std::ostream& os); + 
+ void EncodeTargetPhraseREnc(std::vector& s, + std::vector& t, + std::set& a, + std::ostream& os); + + void EncodeTargetPhrasePREnc(std::vector& s, + std::vector& t, + std::set& a, size_t ownRank, + std::ostream& os); + + void EncodeScores(std::vector& scores, std::ostream& os); + void EncodeAlignment(std::set& alignment, std::ostream& os); + + std::string MakeSourceKey(std::string&); + std::string MakeSourceTargetKey(std::string&, std::string&); + + void LoadLexicalTable(std::string filePath); + + void CreateRankHash(); + void EncodeTargetPhrases(); + void CalcHuffmanCodes(); + void CompressTargetPhrases(); + + void AddRankedLine(PackedItem& pi); + void FlushRankedQueue(bool force = false); + + std::string EncodeLine(std::vector& tokens, size_t ownRank); + void AddEncodedLine(PackedItem& pi); + void FlushEncodedQueue(bool force = false); + + std::string CompressEncodedCollection(std::string encodedCollection); + void AddCompressedCollection(PackedItem& pi); + void FlushCompressedQueue(bool force = false); + + public: + + PhraseTableCreator(std::string inPath, + std::string outPath, + size_t numScoreComponent = 5, + Coding coding = PREnc, + size_t orderBits = 10, + size_t fingerPrintBits = 16, + bool useAlignmentInfo = false, + bool multipleScoreTrees = true, + size_t quantize = 0, + size_t maxRank = 100 +#ifdef WITH_THREADS + , size_t threads = 2 +#endif + ); + + friend class RankingTask; + friend class EncodingTask; + friend class CompressionTask; +}; + +class RankingTask +{ + private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; +#endif + static size_t m_lineNum; + InputFileStream& m_inFile; + PhraseTableCreator& m_creator; + + public: + RankingTask(InputFileStream& inFile, PhraseTableCreator& creator); + void operator()(); +}; + +class EncodingTask +{ + private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; +#endif + static size_t m_lineNum; + static size_t m_sourcePhraseNum; + 
static std::string m_lastSourcePhrase; + + InputFileStream& m_inFile; + PhraseTableCreator& m_creator; + + public: + EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator); + void operator()(); +}; + +class CompressionTask +{ + private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; +#endif + static size_t m_collectionNum; + StringVector& + m_encodedCollections; + PhraseTableCreator& m_creator; + + public: + CompressionTask(StringVector& + encodedCollections, PhraseTableCreator& creator); + void operator()(); +}; + +} + +#endif \ No newline at end of file diff --git a/moses/src/CompactPT/StringVector.h b/moses/src/CompactPT/StringVector.h new file mode 100644 index 000000000..76c5dd658 --- /dev/null +++ b/moses/src/CompactPT/StringVector.h @@ -0,0 +1,600 @@ +#ifndef moses_StringVector_h +#define moses_StringVector_h + +#include +#include +#include +#include +#include +#include + +#include + +#include "MonotonicVector.h" +#include "MmapAllocator.h" + +namespace Moses +{ + +// ********** ValueIteratorRange ********** + +template +class ValueIteratorRange +{ + private: + ValueIteratorT m_begin; + ValueIteratorT m_end; + + public: + ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end); + + const ValueIteratorT& begin() const; + const ValueIteratorT& end() const; + const std::string str() const; + operator const std::string() + { + return str(); + } + + size_t size() + { + return std::distance(m_begin, m_end); + } + + template + bool operator==(const StringT& o) const; + bool operator==(const char* c) const; + + template + bool operator<(const StringT& o) const; + bool operator<(const char* c) const; +}; + +// ********** StringVector ********** + +template class Allocator = std::allocator> +class StringVector +{ + protected: + std::vector > m_charArray; + MonotonicVector m_positions; + bool m_sorted; + bool m_memoryMapped; + + virtual const ValueT* value_ptr(PosT i) const; + + public: + typedef ValueIteratorRange >::const_iterator> range; + + // 
********** RangeIterator ********** + + class RangeIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + RangeIterator(); + RangeIterator(StringVector &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + range dereference() const; + bool equal(RangeIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + + PosT distance_to(RangeIterator const& other) const; + }; + + // ********** StringIterator ********** + + class StringIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVector* m_container; + + public: + StringIterator(); + StringIterator(StringVector &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + const std::string dereference() const; + bool equal(StringIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + PosT distance_to(StringIterator const& other) const; + }; + + typedef RangeIterator iterator; + typedef StringIterator string_iterator; + + StringVector(); + + void swap(StringVector &c) + { + m_positions.commit(); + m_positions.swap(c.m_positions); + m_charArray.swap(c.m_charArray); + + bool temp = m_sorted; + m_sorted = c.m_sorted; + c.m_sorted = temp; + } + + bool is_sorted() const; + PosT size() const; + virtual PosT size2() const; + + template Iterator begin() const; + template Iterator end() const; + + iterator begin() const; + iterator end() const; + + PosT length(PosT i) const; + typename std::vector >::const_iterator begin(PosT i) const; + typename std::vector >::const_iterator end(PosT i) const; + + void clear() + { + m_charArray.clear(); + m_sorted = true; + m_positions = MonotonicVector(); + } + + range at(PosT i) const; + range operator[](PosT i) const; + range back() const; + + template + void push_back(StringT s); + void push_back(const char* c); + + template + PosT 
find(StringT &s) const; + PosT find(const char* c) const; + + virtual size_t load(std::FILE* in, bool memoryMapped = false) + { + size_t size = 0; + m_memoryMapped = memoryMapped; + + size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); + size += m_positions.load(in, m_memoryMapped); + + size += loadCharArray(m_charArray, in, m_memoryMapped); + return size; + } + + size_t loadCharArray(std::vector >& c, + std::FILE* in, bool map = false) + { + // Can only be read into memory. Mapping not possible with std:allocator. + assert(map == false); + + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + c.resize(valSize, 0); + byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + + return byteSize; + } + + size_t loadCharArray(std::vector >& c, + std::FILE* in, bool map = false) + { + size_t byteSize = 0; + + size_t valSize; + byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); + + if(map == false) + { + // Read data into temporary file (default constructor of MmapAllocator) + // and map memory onto temporary file. Can be resized. + + c.resize(valSize, 0); + byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + } + else + { + // Map it directly on specified region of file "in" starting at valPos + // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
+ + size_t valPos = std::ftell(in); + Allocator alloc(in, valPos); + std::vector > charArrayTemp(alloc); + charArrayTemp.resize(valSize); + c.swap(charArrayTemp); + + byteSize += valSize * sizeof(ValueT); + } + + return byteSize; + } + + size_t load(std::string filename, bool memoryMapped = false) + { + std::FILE* pFile = fopen(filename.c_str(), "r"); + size_t byteSize = load(pFile, memoryMapped); + fclose(pFile); + return byteSize; + } + + size_t save(std::FILE* out) + { + size_t byteSize = 0; + byteSize += std::fwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool); + + byteSize += m_positions.save(out); + + size_t valSize = size2(); + byteSize += std::fwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t); + byteSize += std::fwrite(&m_charArray[0], sizeof(ValueT), valSize, out) * sizeof(ValueT); + + return byteSize; + } + + size_t save(std::string filename) + { + std::FILE* pFile = fopen(filename.c_str(), "w"); + size_t byteSize = save(pFile); + fclose(pFile); + return byteSize; + } + +}; + +// ********** Implementation ********** + +// ValueIteratorRange + +template +ValueIteratorRange::ValueIteratorRange(ValueIteratorT begin, + ValueIteratorT end) : m_begin(begin), m_end(end) { } + +template +const ValueIteratorT& ValueIteratorRange::begin() const +{ + return m_begin; +} + +template +const ValueIteratorT& ValueIteratorRange::end() const +{ + return m_end; +} + +template +const std::string ValueIteratorRange::str() const +{ + std::string dummy; + for(ValueIteratorT it = m_begin; it != m_end; it++) + dummy.push_back(*it); + return dummy; +} + +template +template +bool ValueIteratorRange::operator==(const StringT& o) const +{ + if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end())) + return std::equal(m_begin, m_end, o.begin()); + else + return false; +} + +template +bool ValueIteratorRange::operator==(const char* c) const +{ + return *this == std::string(c); +} + +template +template +bool ValueIteratorRange::operator<(const StringT &s2) const 
+{
+  return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
+                                      std::less::value_type>());
+}
+
+template
+bool ValueIteratorRange::operator<(const char* c) const
+{
+  return *this < std::string(c);
+}
+// Free operator<: lexicographic comparison with a string-like object on the left-hand side.
+template
+bool operator<(const StringT &s1, const ValueIteratorRange &s2)
+{
+  return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
+                                      std::less::value_type>());
+}
+// Free operator<: lexicographic comparison with a C string on the left-hand side; its length comes from char_traits::length.
+template
+bool operator<(const char* c, const ValueIteratorRange &s2)
+{
+  size_t len = std::char_traits::length(c);
+  return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
+                                      std::less::value_type>());
+}
+// Streams the range element by element.
+template
+OStream& operator<<(OStream &os, ValueIteratorRange cr)
+{
+  ValueIteratorT it = cr.begin();
+  while(it != cr.end())
+    os << *(it++);
+  return os;
+}
+
+// StringVector
+// A new vector is empty, counts as sorted and is not memory mapped.
+template class Allocator>
+StringVector::StringVector()
+  : m_sorted(true), m_memoryMapped(false) { }
+// Appends s: records the current end of m_charArray as its start offset, then copies its characters. Clears m_sorted unless s compares greater than the previous last string.
+template class Allocator>
+template
+void StringVector::push_back(StringT s)
+{
+  if(is_sorted() && size() && !(back() < s))
+    m_sorted = false;
+
+  m_positions.push_back(size2());
+  std::copy(s.begin(), s.end(), std::back_inserter(m_charArray));
+}
+
+template class Allocator>
+void StringVector::push_back(const char* c)
+{
+  std::string dummy(c);
+  push_back(dummy);
+}
+// Iterator factories: the iterator is built on a non-const reference to this container (const_cast) at index 0 or size().
+template class Allocator>
+template
+Iterator StringVector::begin() const
+{
+  return Iterator(const_cast&>(*this), 0);
+}
+
+template class Allocator>
+template
+Iterator StringVector::end() const
+{
+  return Iterator(const_cast&>(*this), size());
+}
+
+template class Allocator>
+typename StringVector::iterator StringVector::begin() const
+{
+  return begin();
+};
+
+template class Allocator>
+typename StringVector::iterator StringVector::end() const
+{
+  return end();
+};
+
+template class Allocator>
+bool StringVector::is_sorted() const
+{
+  return m_sorted;
+}
+// Number of stored strings.
+template class Allocator>
+PosT StringVector::size() const
+{
+  return m_positions.size();
+}
+
+template class Allocator>
+PosT StringVector::size2() const
+{
+  return m_charArray.size();
+}
+// at(i): the i-th string as a range [begin(i), end(i)); the index is not checked here.
+template class Allocator>
+typename StringVector::range StringVector::at(PosT i) const
+{
+  return range(begin(i), end(i));
+}
+
+template class Allocator>
+typename StringVector::range StringVector::operator[](PosT i) const
+{
+  return at(i);
+}
+// back(): the last string; evaluates at(size()-1) without checking for an empty container.
+template class Allocator>
+typename StringVector::range StringVector::back() const
+{
+  return at(size()-1);
+}
+// length(i): distance from the start offset of string i to the next start offset, or to the end of the data for the last string.
+template class Allocator>
+PosT StringVector::length(PosT i) const
+{
+  if(i+1 < size())
+    return m_positions[i+1] - m_positions[i];
+  else
+    return size2() - m_positions[i];
+}
+// value_ptr(i): address of the first character of string i inside m_charArray.
+template class Allocator>
+const ValueT* StringVector::value_ptr(PosT i) const
+{
+  return &m_charArray[m_positions[i]];
+}
+
+template class Allocator>
+typename std::vector >::const_iterator StringVector::begin(PosT i) const
+{
+  return typename std::vector >::const_iterator(value_ptr(i));
+}
+
+template class Allocator>
+typename std::vector >::const_iterator StringVector::end(PosT i) const
+{
+  return typename std::vector >::const_iterator(value_ptr(i) + length(i));
+}
+// find(s): when sorted, the lower_bound index (first string not less than s, which need not equal s); otherwise the index of the first equal string. Yields size() when the search reaches the end.
+template class Allocator>
+template
+PosT StringVector::find(StringT &s) const
+{
+  if(m_sorted)
+    return std::distance(begin(), std::lower_bound(begin(), end(), s));
+  return std::distance(begin(), std::find(begin(), end(), s));
+}
+
+template class Allocator>
+PosT StringVector::find(const char* c) const
+{
+  std::string s(c);
+  return find(s);
+}
+
+// RangeIterator
+
+template class Allocator>
+StringVector::RangeIterator::RangeIterator() : m_index(0), m_container(0) { }
+
+template class Allocator>
+StringVector::RangeIterator::RangeIterator(StringVector &sv, PosT index)
+  : m_index(index), m_container(&sv) { }
+
+template class Allocator>
+PosT StringVector::RangeIterator::get_index()
+{
+  return m_index;
+}
+// Dereference: the range of the string at m_index in the referenced container.
+template class Allocator>
+typename StringVector::range
+  StringVector::RangeIterator::dereference() const
+{
+  return typename StringVector::range(
+    m_container->begin(m_index),
+    m_container->end(m_index)
+  );
+}
+// Iterators are equal when they have the same index and point to the same container.
+template class Allocator>
+bool StringVector::RangeIterator::equal(
+  StringVector::RangeIterator const& other) const
+{
+  return m_index == other.m_index && m_container == other.m_container;
+}
+
+template class Allocator>
+void StringVector::RangeIterator::increment()
+{
+  m_index++;
+}
+
+template class Allocator>
+void StringVector::RangeIterator::decrement()
+{
+  m_index--;
+}
+
+template class Allocator>
+void StringVector::RangeIterator::advance(PosT n)
+{
+  m_index += n;
+}
+// Distance is the index difference, computed and returned as PosT.
+template class Allocator>
+PosT StringVector::RangeIterator::distance_to(
+  StringVector::RangeIterator const& other) const
+{
+  return other.m_index - m_index;
+}
+
+// StringIterator
+
+template class Allocator>
+StringVector::StringIterator::StringIterator()
+  : m_index(0), m_container(0) { }
+
+template class Allocator>
+StringVector::StringIterator::StringIterator(
+  StringVector &sv, PosT index) : m_index(index),
+  m_container(&sv) { }
+
+template class Allocator>
+PosT StringVector::StringIterator::get_index()
+{
+  return m_index;
+}
+// Dereference: a std::string copy of the string at m_index.
+template class Allocator>
+const std::string StringVector::StringIterator::dereference() const
+{
+  return StringVector::range(m_container->begin(m_index),
+                             m_container->end(m_index)).str();
+}
+
+template class Allocator>
+bool StringVector::StringIterator::equal(
+  StringVector::StringIterator const& other) const
+{
+  return m_index == other.m_index && m_container == other.m_container;
+}
+
+template class Allocator>
+void StringVector::StringIterator::increment()
+{
+  m_index++;
+}
+
+template class Allocator>
+void StringVector::StringIterator::decrement()
+{
+  m_index--;
+}
+
+template class Allocator>
+void StringVector::StringIterator::advance(PosT n)
+{
+  m_index += n;
+}
+
+template class Allocator>
+PosT StringVector::StringIterator::distance_to(
+  StringVector::StringIterator const& other) const
+{
+  return other.m_index - m_index;
+}
+
+// ********** Some typedefs **********
+
+typedef StringVector MediumStringVector;
+typedef StringVector LongStringVector;
+
+}
+
+#endif
diff --git a/moses/src/CompactPT/TargetPhraseCollectionCache.h b/moses/src/CompactPT/TargetPhraseCollectionCache.h
new file mode 100644
index 000000000..7449f779b
--- /dev/null
+++ b/moses/src/CompactPT/TargetPhraseCollectionCache.h
@@ -0,0 +1,161 @@
+#ifndef moses_TargetPhraseCollectionCache_h
+#define moses_TargetPhraseCollectionCache_h
+
+#include
+#include
+#include
+
+#ifdef WITH_THREADS
+#ifdef BOOST_HAS_PTHREADS
+#include
+#endif
+#endif
+
+#include
+
+#include "Phrase.h"
+#include "TargetPhraseCollection.h"
+
+namespace Moses
+{
+
+// Avoid using new due to locking
+typedef std::vector TargetPhraseVector;
+typedef boost::shared_ptr TargetPhraseVectorPtr;
+// Cache from source phrase to target phrase vector; entries carry a clock() timestamp that Prune() uses for eviction.
+class TargetPhraseCollectionCache
+{
+  private:
+    size_t m_max; // target number of entries
+    float m_tolerance; // fraction of m_max by which the size may overshoot before pruning and undershoot after it
+
+    struct LastUsed {
+      clock_t m_clock; // clock() value of the last Cache()/Retrieve() that touched the entry
+      TargetPhraseVectorPtr m_tpv;
+      size_t m_bitsLeft;
+
+      LastUsed() : m_clock(0), m_bitsLeft(0) {}
+
+      LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
+        : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
+    };
+
+    typedef std::map CacheMap;
+
+    CacheMap m_phraseCache;
+
+#ifdef WITH_THREADS
+    boost::mutex m_mutex;
+#endif
+
+  public:
+
+    typedef CacheMap::iterator iterator;
+    typedef CacheMap::const_iterator const_iterator;
+
+    TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
+      : m_max(max), m_tolerance(tolerance)
+    {}
+    // Begin()/End() expose the map iterators directly and do not take the mutex.
+    iterator Begin()
+    {
+      return m_phraseCache.begin();
+    }
+
+    const_iterator Begin() const
+    {
+      return m_phraseCache.begin();
+    }
+
+    iterator End()
+    {
+      return m_phraseCache.end();
+    }
+
+    const_iterator End() const
+    {
+      return m_phraseCache.end();
+    }
+    // Inserts tpv for sourcePhrase, keeping only the first maxRank entries when maxRank is non-zero and tpv is longer. An existing entry only gets its timestamp refreshed.
+    void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
+               size_t bitsLeft = 0, size_t maxRank = 0)
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_mutex);
+#endif
+
+      iterator it = m_phraseCache.find(sourcePhrase);
+      if(it != m_phraseCache.end())
+        it->second.m_clock = clock();
+      else
+      {
+        if(maxRank && tpv->size() > maxRank)
+        {
+          TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
+          tpv_temp->resize(maxRank);
+          std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
+          m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
+        }
+        else
+          m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
+      }
+    }
+    // Returns (vector, bitsLeft) for sourcePhrase and refreshes its timestamp; on a miss returns a default-constructed pointer and 0.
+    std::pair Retrieve(const Phrase &sourcePhrase)
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_mutex);
+#endif
+
+      iterator it = m_phraseCache.find(sourcePhrase);
+      if(it != m_phraseCache.end())
+      {
+        LastUsed &lu = it->second;
+        lu.m_clock = clock();
+        return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
+      }
+      else
+        return std::make_pair(TargetPhraseVectorPtr(), 0);
+    }
+    // Once the size exceeds m_max * (1 + m_tolerance), erases entries in ascending timestamp order until the size is below m_max * (1 - m_tolerance).
+    void Prune()
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_mutex);
+#endif
+
+      if(m_phraseCache.size() > m_max * (1 + m_tolerance))
+      {
+        typedef std::set > Cands;
+        Cands cands; // (timestamp, phrase) pairs; the set keeps them ordered, oldest timestamp first
+        for(CacheMap::iterator it = m_phraseCache.begin();
+            it != m_phraseCache.end(); it++)
+        {
+          LastUsed &lu = it->second;
+          cands.insert(std::make_pair(lu.m_clock, it->first));
+        }
+
+        for(Cands::iterator it = cands.begin(); it != cands.end(); it++)
+        {
+          const Phrase& p = it->second;
+          m_phraseCache.erase(p);
+
+          if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
+            break;
+        }
+      }
+    }
+    // Drops every cached entry.
+    void CleanUp()
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_mutex);
+#endif
+      m_phraseCache.clear();
+    }
+
+};
+
+}
+
+#endif
\ No newline at end of file