mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
Adding source file for compact phrase table
This commit is contained in:
parent
80d8592ef8
commit
004cc0d3b1
307
moses/src/CompactPT/BlockHashIndex.cpp
Normal file
307
moses/src/CompactPT/BlockHashIndex.cpp
Normal file
@ -0,0 +1,307 @@
|
||||
#include "BlockHashIndex.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
// Threaded build: creates an empty index that builds its hash functions with
// CMPH_CHD. Nothing has been saved or dropped yet (both markers start at -1).
//   orderBits / fingerPrintBits - bit widths forwarded to the packed arrays
//   threadsNum                  - value handed to the internal thread pool
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
                               size_t threadsNum)
  : m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0),
    m_fileHandleStart(0),
    m_algo(CMPH_CHD),
    m_size(0),
    m_lastSaved(-1),
    m_lastDropped(-1),
    m_numLoadedRanges(0),
    m_threadPool(threadsNum)
{
}
|
||||
|
||||
// Threaded build: same as the constructor without `algo`, except that the
// caller chooses the CMPH algorithm used to build the hash functions.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
                               CMPH_ALGO algo, size_t threadsNum)
  : m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0),
    m_fileHandleStart(0),
    m_algo(algo),
    m_size(0),
    m_lastSaved(-1),
    m_lastDropped(-1),
    m_numLoadedRanges(0),
    m_threadPool(threadsNum)
{
}
|
||||
#else
|
||||
// Single-threaded build: creates an empty index that builds its hash
// functions with CMPH_CHD. Nothing has been saved or dropped yet.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
  : m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0),
    m_fileHandleStart(0),
    m_algo(CMPH_CHD),
    m_size(0),
    m_lastSaved(-1),
    m_lastDropped(-1),
    m_numLoadedRanges(0)
{
}
|
||||
|
||||
// Single-threaded build: creates an empty index whose hash functions are
// built with the CMPH algorithm chosen by the caller.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
                               CMPH_ALGO algo)
  : m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0),
    m_fileHandleStart(0),
    m_algo(algo),
    m_size(0),
    m_lastSaved(-1),
    m_lastDropped(-1),
    m_numLoadedRanges(0)
{
}
|
||||
#endif
|
||||
|
||||
// Releases every hash function and packed array that is still resident.
// Slots that were dropped earlier hold null pointers and are skipped.
BlockHashIndex::~BlockHashIndex()
{
  for(size_t h = 0; h < m_hashes.size(); ++h) {
    if(m_hashes[h] != 0) {
      cmph_destroy(m_hashes[h]);
    }
  }

  for(size_t a = 0; a < m_arrays.size(); ++a) {
    if(m_arrays[a] != 0) {
      delete m_arrays[a];
    }
  }
}
|
||||
|
||||
size_t BlockHashIndex::GetHash(const char* key)
|
||||
{
|
||||
std::string keyStr(key);
|
||||
size_t i = std::distance(m_landmarks.begin(),
|
||||
std::upper_bound(m_landmarks.begin(),
|
||||
m_landmarks.end(), keyStr)) - 1;
|
||||
|
||||
if(i == 0ul-1)
|
||||
return GetSize();
|
||||
|
||||
size_t pos = GetHash(i, key);
|
||||
if(pos != GetSize())
|
||||
return (1ul << m_orderBits) * i + pos;
|
||||
else
|
||||
return GetSize();
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::GetFprint(const char* key) const
|
||||
{
|
||||
size_t hash;
|
||||
MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash);
|
||||
hash &= (1ul << m_fingerPrintBits) - 1;
|
||||
return hash;
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::GetHash(size_t i, const char* key)
|
||||
{
|
||||
if(m_hashes[i] == 0)
|
||||
LoadRange(i);
|
||||
|
||||
size_t idx = cmph_search(m_hashes[i], key, (cmph_uint32) strlen(key));
|
||||
|
||||
std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
|
||||
m_clocks[i] = clock();
|
||||
|
||||
if(GetFprint(key) == orderPrint.second)
|
||||
return orderPrint.first;
|
||||
else
|
||||
return GetSize();
|
||||
}
|
||||
|
||||
// std::string convenience overload; forwards to GetHash(const char*).
size_t BlockHashIndex::GetHash(std::string key)
{
  const char* raw = key.c_str();
  return GetHash(raw);
}
|
||||
|
||||
size_t BlockHashIndex::operator[](std::string key)
|
||||
{
|
||||
return GetHash(key);
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::operator[](char* key)
|
||||
{
|
||||
return GetHash(key);
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::Save(std::string filename)
|
||||
{
|
||||
std::FILE* mphf = std::fopen(filename.c_str(), "w");
|
||||
size_t size = Save(mphf);
|
||||
std::fclose(mphf);
|
||||
return size;
|
||||
}
|
||||
|
||||
void BlockHashIndex::BeginSave(std::FILE * mphf)
|
||||
{
|
||||
m_fileHandle = mphf;
|
||||
std::fwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle);
|
||||
std::fwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle);
|
||||
|
||||
m_fileHandleStart = std::ftell(m_fileHandle);
|
||||
|
||||
size_t relIndexPos = 0;
|
||||
std::fwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
|
||||
}
|
||||
|
||||
void BlockHashIndex::SaveRange(size_t i)
|
||||
{
|
||||
if(m_seekIndex.size() <= i)
|
||||
m_seekIndex.resize(i+1);
|
||||
m_seekIndex[i] = std::ftell(m_fileHandle) - m_fileHandleStart;
|
||||
cmph_dump(m_hashes[i], m_fileHandle);
|
||||
m_arrays[i]->Save(m_fileHandle);
|
||||
}
|
||||
|
||||
void BlockHashIndex::SaveLastRange()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top())
|
||||
{
|
||||
size_t current = -m_queue.top();
|
||||
m_queue.pop();
|
||||
SaveRange(current);
|
||||
m_lastSaved = current;
|
||||
}
|
||||
}
|
||||
|
||||
void BlockHashIndex::DropRange(size_t i)
|
||||
{
|
||||
if(m_hashes[i] != 0)
|
||||
{
|
||||
cmph_destroy(m_hashes[i]);
|
||||
m_hashes[i] = 0;
|
||||
}
|
||||
if(m_arrays[i] != 0)
|
||||
{
|
||||
delete m_arrays[i];
|
||||
m_arrays[i] = 0;
|
||||
m_clocks[i] = 0;
|
||||
}
|
||||
m_numLoadedRanges--;
|
||||
}
|
||||
|
||||
void BlockHashIndex::DropLastRange()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
while(m_lastDropped != m_lastSaved)
|
||||
DropRange(++m_lastDropped);
|
||||
}
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
void BlockHashIndex::WaitAll()
|
||||
{
|
||||
m_threadPool.Stop(true);
|
||||
}
|
||||
#endif
|
||||
|
||||
size_t BlockHashIndex::FinalizeSave()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
m_threadPool.Stop(true);
|
||||
#endif
|
||||
|
||||
SaveLastRange();
|
||||
|
||||
size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
|
||||
|
||||
std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
|
||||
std::fwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
|
||||
|
||||
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
|
||||
m_landmarks.save(m_fileHandle);
|
||||
|
||||
size_t seekIndexSize = m_seekIndex.size();
|
||||
std::fwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
|
||||
std::fwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
|
||||
|
||||
std::fwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
|
||||
|
||||
size_t fileHandleStop = std::ftell(m_fileHandle);
|
||||
return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
|
||||
+ sizeof(m_fingerPrintBits);
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::Save(std::FILE * mphf)
|
||||
{
|
||||
m_queue = std::priority_queue<int>();
|
||||
BeginSave(mphf);
|
||||
for(size_t i = 0; i < m_hashes.size(); i++)
|
||||
SaveRange(i);
|
||||
return FinalizeSave();
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
|
||||
{
|
||||
m_fileHandle = mphf;
|
||||
|
||||
size_t beginning = std::ftell(mphf);
|
||||
|
||||
std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
|
||||
std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
|
||||
m_fileHandleStart = std::ftell(m_fileHandle);
|
||||
|
||||
size_t relIndexPos;
|
||||
std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
|
||||
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
|
||||
|
||||
m_landmarks.load(mphf);
|
||||
|
||||
size_t seekIndexSize;
|
||||
std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
|
||||
m_seekIndex.resize(seekIndexSize);
|
||||
std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
|
||||
m_hashes.resize(seekIndexSize, 0);
|
||||
m_clocks.resize(seekIndexSize, 0);
|
||||
m_arrays.resize(seekIndexSize, 0);
|
||||
|
||||
std::fread(&m_size, sizeof(size_t), 1, m_fileHandle);
|
||||
|
||||
size_t end = std::ftell(mphf);
|
||||
|
||||
return end - beginning;
|
||||
}
|
||||
|
||||
void BlockHashIndex::LoadRange(size_t i)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET);
|
||||
cmph_t* hash = cmph_load(m_fileHandle);
|
||||
m_arrays[i] = new PairedPackedArray<>(0, m_orderBits,
|
||||
m_fingerPrintBits);
|
||||
m_arrays[i]->Load(m_fileHandle);
|
||||
|
||||
m_hashes[i] = hash;
|
||||
m_clocks[i] = clock();
|
||||
|
||||
m_numLoadedRanges++;
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::Load(std::string filename)
|
||||
{
|
||||
std::FILE* mphf = std::fopen(filename.c_str(), "r");
|
||||
size_t size = Load(mphf);
|
||||
std::fclose(mphf);
|
||||
return size;
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::Load(std::FILE * mphf)
|
||||
{
|
||||
size_t byteSize = LoadIndex(mphf);
|
||||
size_t end = std::ftell(mphf);
|
||||
|
||||
for(size_t i = 0; i < m_seekIndex.size(); i++)
|
||||
LoadRange(i);
|
||||
std::fseek(m_fileHandle, end, SEEK_SET);
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t BlockHashIndex::GetSize() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
size_t n = m_hashes.size() * ratio;
|
||||
if(m_numLoadedRanges > size_t(n * (1 + tolerance)))
|
||||
{
|
||||
typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
|
||||
LastLoaded lastLoaded;
|
||||
for(size_t i = 0; i < m_hashes.size(); i++)
|
||||
if(m_hashes[i] != 0)
|
||||
lastLoaded.push_back(std::make_pair(m_clocks[i], i));
|
||||
|
||||
std::sort(lastLoaded.begin(), lastLoaded.end());
|
||||
for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
|
||||
it != lastLoaded.rend(); it++)
|
||||
DropRange(it->second);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
197
moses/src/CompactPT/BlockHashIndex.h
Normal file
197
moses/src/CompactPT/BlockHashIndex.h
Normal file
@ -0,0 +1,197 @@
|
||||
#ifndef moses_BlockHashIndex_h
|
||||
#define moses_BlockHashIndex_h
|
||||
|
||||
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
|
||||
|
||||
#include "cmph/src/cmph.h"
|
||||
#include "MurmurHash3.h"
|
||||
#include "StringVector.h"
|
||||
#include "CmphStringVectorAdapter.h"
|
||||
#include "PackedArray.h"
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#include "ThreadPool.h"
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class BlockHashIndex
|
||||
{
|
||||
private:
|
||||
std::priority_queue<int> m_queue;
|
||||
|
||||
size_t m_orderBits;
|
||||
size_t m_fingerPrintBits;
|
||||
|
||||
std::FILE* m_fileHandle;
|
||||
size_t m_fileHandleStart;
|
||||
|
||||
CMPH_ALGO m_algo;
|
||||
|
||||
StringVector<unsigned char, unsigned long> m_landmarks;
|
||||
|
||||
std::vector<cmph_t*> m_hashes;
|
||||
std::vector<clock_t> m_clocks;
|
||||
std::vector<PairedPackedArray<>*> m_arrays;
|
||||
|
||||
std::vector<size_t> m_seekIndex;
|
||||
|
||||
size_t m_size;
|
||||
int m_lastSaved;
|
||||
int m_lastDropped;
|
||||
size_t m_numLoadedRanges;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
ThreadPool m_threadPool;
|
||||
boost::mutex m_mutex;
|
||||
|
||||
template <typename Keys>
|
||||
class HashTask : public Task
|
||||
{
|
||||
public:
|
||||
HashTask(int id, BlockHashIndex& hash, Keys& keys)
|
||||
: m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
|
||||
|
||||
virtual void Run()
|
||||
{
|
||||
m_hash.CalcHash(m_id, *m_keys);
|
||||
}
|
||||
|
||||
virtual ~HashTask()
|
||||
{
|
||||
delete m_keys;
|
||||
}
|
||||
|
||||
private:
|
||||
int m_id;
|
||||
BlockHashIndex& m_hash;
|
||||
Keys* m_keys;
|
||||
};
|
||||
#endif
|
||||
|
||||
size_t GetFprint(const char* key) const;
|
||||
size_t GetHash(size_t i, const char* key);
|
||||
|
||||
public:
|
||||
#ifdef WITH_THREADS
|
||||
BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
|
||||
size_t threadsNum = 2);
|
||||
BlockHashIndex(size_t orderBits, size_t fingerPrintBits, CMPH_ALGO algo,
|
||||
size_t threadsNum = 2);
|
||||
#else
|
||||
BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
|
||||
BlockHashIndex(size_t orderBits, size_t fingerPrintBits, CMPH_ALGO algo);
|
||||
#endif
|
||||
|
||||
~BlockHashIndex();
|
||||
|
||||
size_t GetHash(const char* key);
|
||||
size_t GetHash(std::string key);
|
||||
|
||||
size_t operator[](std::string key);
|
||||
size_t operator[](char* key);
|
||||
|
||||
void BeginSave(std::FILE* mphf);
|
||||
void SaveRange(size_t i);
|
||||
void SaveLastRange();
|
||||
size_t FinalizeSave();
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
void WaitAll();
|
||||
#endif
|
||||
|
||||
void DropRange(size_t i);
|
||||
void DropLastRange();
|
||||
|
||||
size_t LoadIndex(std::FILE* mphf);
|
||||
void LoadRange(size_t i);
|
||||
|
||||
size_t Save(std::string filename);
|
||||
size_t Save(std::FILE * mphf);
|
||||
|
||||
size_t Load(std::string filename);
|
||||
size_t Load(std::FILE * mphf);
|
||||
|
||||
size_t GetSize() const;
|
||||
|
||||
void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
|
||||
|
||||
template <typename Keys>
|
||||
void AddRange(Keys &keys)
|
||||
{
|
||||
size_t current = m_landmarks.size();
|
||||
m_landmarks.push_back(keys[0]);
|
||||
m_size += keys.size();
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
|
||||
m_threadPool.Submit(ht);
|
||||
#else
|
||||
CalcHash(current, keys);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Keys>
|
||||
void CalcHash(size_t current, Keys &keys)
|
||||
{
|
||||
cmph_io_adapter_t *source = VectorAdapter(keys);
|
||||
|
||||
cmph_config_t *config = cmph_config_new(source);
|
||||
cmph_config_set_algo(config, m_algo);
|
||||
|
||||
cmph_t* hash = cmph_new(config);
|
||||
cmph_config_destroy(config);
|
||||
|
||||
PairedPackedArray<> *pv =
|
||||
new PairedPackedArray<>(keys.size(), m_orderBits, m_fingerPrintBits);
|
||||
|
||||
size_t i = 0;
|
||||
for(typename Keys::iterator it = keys.begin(); it != keys.end(); it++)
|
||||
{
|
||||
std::string temp = *it;
|
||||
size_t fprint = GetFprint(temp.c_str());
|
||||
size_t idx = cmph_search(hash, temp.c_str(),
|
||||
(cmph_uint32) temp.size());
|
||||
|
||||
pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
|
||||
i++;
|
||||
}
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
if(m_hashes.size() <= current)
|
||||
{
|
||||
m_hashes.resize(current + 1, 0);
|
||||
m_arrays.resize(current + 1, 0);
|
||||
m_clocks.resize(current + 1, 0);
|
||||
}
|
||||
|
||||
m_hashes[current] = hash;
|
||||
m_arrays[current] = pv;
|
||||
m_clocks[current] = clock();
|
||||
m_queue.push(-current);
|
||||
}
|
||||
|
||||
cmph_io_adapter_t* VectorAdapter(std::vector<std::string>& v)
|
||||
{
|
||||
return CmphVectorAdapter(v);
|
||||
}
|
||||
|
||||
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
cmph_io_adapter_t* VectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
|
||||
{
|
||||
return CmphStringVectorAdapter(sv);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
322
moses/src/CompactPT/CanonicalHuffman.h
Normal file
322
moses/src/CompactPT/CanonicalHuffman.h
Normal file
@ -0,0 +1,322 @@
|
||||
#ifndef moses_CanonicalHuffman_h
|
||||
#define moses_CanonicalHuffman_h
|
||||
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
namespace Moses {
|
||||
|
||||
template<typename PosType, typename DataType> class Hufftree;
|
||||
|
||||
template <typename Data, typename Code = size_t>
|
||||
class CanonicalHuffman
|
||||
{
|
||||
private:
|
||||
std::vector<Data> m_symbols;
|
||||
|
||||
std::vector<Code> m_firstCodes;
|
||||
std::vector<size_t> m_lengthIndex;
|
||||
|
||||
typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
|
||||
EncodeMap m_encodeMap;
|
||||
|
||||
struct MinHeapSorter {
|
||||
std::vector<size_t>& m_vec;
|
||||
|
||||
MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
|
||||
|
||||
bool operator()(size_t a, size_t b)
|
||||
{
|
||||
return m_vec[a] > m_vec[b];
|
||||
}
|
||||
};
|
||||
|
||||
template <class Iterator>
|
||||
void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths)
|
||||
{
|
||||
size_t n = std::distance(begin, end);
|
||||
std::vector<size_t> A(2 * n, 0);
|
||||
|
||||
m_symbols.resize(n);
|
||||
size_t i = 0;
|
||||
for(Iterator it = begin; it != end; it++)
|
||||
{
|
||||
m_symbols[i] = it->first;
|
||||
|
||||
A[i] = n + i;
|
||||
A[n + i] = it->second;
|
||||
i++;
|
||||
}
|
||||
|
||||
if(n == 1)
|
||||
{
|
||||
lengths.push_back(1);
|
||||
return;
|
||||
}
|
||||
|
||||
MinHeapSorter hs(A);
|
||||
std::make_heap(A.begin(), A.begin() + n, hs);
|
||||
|
||||
size_t h = n;
|
||||
size_t m1, m2;
|
||||
while(h > 1)
|
||||
{
|
||||
m1 = A[0];
|
||||
std::pop_heap(A.begin(), A.begin() + h, hs);
|
||||
|
||||
h--;
|
||||
|
||||
m2 = A[0];
|
||||
std::pop_heap(A.begin(), A.begin() + h, hs);
|
||||
|
||||
A[h] = A[m1] + A[m2];
|
||||
A[h-1] = h;
|
||||
A[m1] = A[m2] = h;
|
||||
|
||||
std::push_heap(A.begin(), A.begin() + h, hs);
|
||||
}
|
||||
|
||||
A[1] = 0;
|
||||
for(size_t i = 2; i < 2*n; i++)
|
||||
A[i] = A[A[i]] + 1;
|
||||
|
||||
lengths.resize(n);
|
||||
for(size_t i = 0; i < n; i++)
|
||||
lengths[i] = A[i + n];
|
||||
}
|
||||
|
||||
|
||||
void CalcCodes(std::vector<size_t>& lengths)
|
||||
{
|
||||
std::vector<size_t> numLength;
|
||||
for(std::vector<size_t>::iterator it = lengths.begin();
|
||||
it != lengths.end(); it++) {
|
||||
size_t length = *it;
|
||||
if(numLength.size() <= length)
|
||||
numLength.resize(length + 1, 0);
|
||||
numLength[length]++;
|
||||
}
|
||||
|
||||
m_lengthIndex.resize(numLength.size());
|
||||
m_lengthIndex[0] = 0;
|
||||
for(size_t l = 1; l < numLength.size(); l++)
|
||||
m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
|
||||
|
||||
size_t maxLength = numLength.size() - 1;
|
||||
|
||||
m_firstCodes.resize(maxLength + 1, 0);
|
||||
for(size_t l = maxLength - 1; l > 0; l--)
|
||||
m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
|
||||
|
||||
std::vector<Data> t_symbols;
|
||||
t_symbols.resize(lengths.size());
|
||||
|
||||
std::vector<size_t> nextCode = m_firstCodes;
|
||||
for(size_t i = 0; i < lengths.size(); i++)
|
||||
{
|
||||
Data data = m_symbols[i];
|
||||
size_t length = lengths[i];
|
||||
|
||||
size_t pos = m_lengthIndex[length]
|
||||
+ (nextCode[length] - m_firstCodes[length]);
|
||||
t_symbols[pos] = data;
|
||||
|
||||
nextCode[length] = nextCode[length] + 1;
|
||||
}
|
||||
|
||||
m_symbols.swap(t_symbols);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
CanonicalHuffman(std::FILE* pFile, bool forEncoding = false)
|
||||
{
|
||||
Load(pFile);
|
||||
|
||||
if(forEncoding)
|
||||
CreateCodeMap();
|
||||
}
|
||||
|
||||
template <class Iterator>
|
||||
CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true)
|
||||
{
|
||||
std::vector<size_t> lengths;
|
||||
CalcLengths(begin, end, lengths);
|
||||
CalcCodes(lengths);
|
||||
|
||||
if(forEncoding)
|
||||
CreateCodeMap();
|
||||
}
|
||||
|
||||
void CreateCodeMap()
|
||||
{
|
||||
for(size_t l = 1; l < m_lengthIndex.size(); l++)
|
||||
{
|
||||
Code code = m_firstCodes[l];
|
||||
size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
|
||||
: m_symbols.size()) - m_lengthIndex[l];
|
||||
|
||||
for(size_t i = 0; i < num; i++)
|
||||
{
|
||||
Data data = m_symbols[m_lengthIndex[l] + i];
|
||||
boost::dynamic_bitset<> bitCode(l, code);
|
||||
m_encodeMap[data] = bitCode;
|
||||
code++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boost::dynamic_bitset<>& Encode(Data data)
|
||||
{
|
||||
return m_encodeMap[data];
|
||||
}
|
||||
|
||||
template <class BitStream>
|
||||
Data NextSymbol(BitStream& bitStream)
|
||||
{
|
||||
if(bitStream.RemainingBits())
|
||||
{
|
||||
Code code = bitStream.GetNext();
|
||||
size_t length = 1;
|
||||
while(code < m_firstCodes[length])
|
||||
{
|
||||
code = 2 * code + bitStream.GetNext();
|
||||
length++;
|
||||
}
|
||||
|
||||
size_t symbolIndex = m_lengthIndex[length]
|
||||
+ (code - m_firstCodes[length]);
|
||||
return m_symbols[symbolIndex];
|
||||
}
|
||||
return Data();
|
||||
}
|
||||
|
||||
size_t Load(std::FILE* pFile)
|
||||
{
|
||||
size_t start = std::ftell(pFile);
|
||||
|
||||
size_t size;
|
||||
std::fread(&size, sizeof(size_t), 1, pFile);
|
||||
m_symbols.resize(size);
|
||||
std::fread(&m_symbols[0], sizeof(Data), size, pFile);
|
||||
|
||||
std::fread(&size, sizeof(size_t), 1, pFile);
|
||||
m_firstCodes.resize(size);
|
||||
std::fread(&m_firstCodes[0], sizeof(Code), size, pFile);
|
||||
|
||||
std::fread(&size, sizeof(size_t), 1, pFile);
|
||||
m_lengthIndex.resize(size);
|
||||
std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
|
||||
|
||||
return std::ftell(pFile) - start;
|
||||
}
|
||||
|
||||
size_t Save(std::FILE* pFile)
|
||||
{
|
||||
size_t start = std::ftell(pFile);
|
||||
|
||||
size_t size = m_symbols.size();
|
||||
std::fwrite(&size, sizeof(size_t), 1, pFile);
|
||||
std::fwrite(&m_symbols[0], sizeof(Data), size, pFile);
|
||||
|
||||
size = m_firstCodes.size();
|
||||
std::fwrite(&size, sizeof(size_t), 1, pFile);
|
||||
std::fwrite(&m_firstCodes[0], sizeof(Code), size, pFile);
|
||||
|
||||
size = m_lengthIndex.size();
|
||||
std::fwrite(&size, sizeof(size_t), 1, pFile);
|
||||
std::fwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
|
||||
|
||||
return std::ftell(pFile) - start;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template <class Container = std::string>
|
||||
class BitStream
|
||||
{
|
||||
private:
|
||||
Container& m_data;
|
||||
|
||||
typename Container::iterator m_iterator;
|
||||
typename Container::value_type m_currentValue;
|
||||
|
||||
size_t m_valueBits;
|
||||
typename Container::value_type m_mask;
|
||||
size_t m_bitPos;
|
||||
|
||||
public:
|
||||
|
||||
BitStream(Container &data)
|
||||
: m_data(data), m_iterator(m_data.begin()),
|
||||
m_valueBits(sizeof(typename Container::value_type) * 8),
|
||||
m_mask(1), m_bitPos(0) { }
|
||||
|
||||
size_t RemainingBits()
|
||||
{
|
||||
if(m_data.size() * m_valueBits < m_bitPos)
|
||||
return 0;
|
||||
return m_data.size() * m_valueBits - m_bitPos;
|
||||
}
|
||||
|
||||
void SetLeft(size_t bitPos)
|
||||
{
|
||||
m_bitPos = m_data.size() * m_valueBits - bitPos;
|
||||
m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
|
||||
m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
|
||||
m_iterator++;
|
||||
}
|
||||
|
||||
bool GetNext()
|
||||
{
|
||||
if(m_bitPos % m_valueBits == 0)
|
||||
{
|
||||
if(m_iterator != m_data.end())
|
||||
{
|
||||
m_currentValue = *m_iterator++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
m_currentValue = m_currentValue >> 1;
|
||||
}
|
||||
|
||||
m_bitPos++;
|
||||
return (m_currentValue & m_mask);
|
||||
}
|
||||
|
||||
void PutCode(boost::dynamic_bitset<> code)
|
||||
{
|
||||
|
||||
for(int j = code.size()-1; j >= 0; j--)
|
||||
{
|
||||
if(m_bitPos % m_valueBits == 0)
|
||||
{
|
||||
m_data.push_back(0);
|
||||
}
|
||||
|
||||
if(code[j])
|
||||
m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
|
||||
|
||||
m_bitPos++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Reset()
|
||||
{
|
||||
m_iterator = m_data.begin();
|
||||
m_bitPos = 0;
|
||||
}
|
||||
|
||||
Container& GetContainer()
|
||||
{
|
||||
return m_data;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
69
moses/src/CompactPT/CmphStringVectorAdapter.cpp
Normal file
69
moses/src/CompactPT/CmphStringVectorAdapter.cpp
Normal file
@ -0,0 +1,69 @@
|
||||
#include "CmphStringVectorAdapter.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// cmph "dispose" callback: frees a key buffer that the read callback
// allocated with new[]. The other two arguments are not used.
void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
{
  char* buffer = key;
  delete[] buffer;
}
|
||||
|
||||
void CmphStringVectorAdapterRewind(void *data)
|
||||
{
|
||||
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
||||
cmph_vector->position = 0;
|
||||
}
|
||||
|
||||
//************************************************************************//
|
||||
|
||||
// Allocates (with malloc) a cmph key source and a cursor for `v`. The cursor
// stores a pointer to `v`, so the vector must outlive the adapter. The
// callbacks are not set here; see CmphVectorAdapter().
cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
{
  cmph_io_adapter_t * adapter =
    static_cast<cmph_io_adapter_t *>(malloc(sizeof(cmph_io_adapter_t)));
  cmph_vector_t * cursor =
    static_cast<cmph_vector_t *>(malloc(sizeof(cmph_vector_t)));
  assert(adapter);
  assert(cursor);

  cursor->vector = static_cast<void *>(&v);
  cursor->position = 0;

  adapter->data = static_cast<void *>(cursor);
  adapter->nkeys = v.size();

  return adapter;
}
|
||||
|
||||
int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
|
||||
{
|
||||
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
||||
std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
|
||||
size_t size;
|
||||
*keylen = (*v)[cmph_vector->position].size();
|
||||
size = *keylen;
|
||||
*key = new char[size + 1];
|
||||
std::string temp = (*v)[cmph_vector->position];
|
||||
strcpy(*key, temp.c_str());
|
||||
cmph_vector->position = cmph_vector->position + 1;
|
||||
return (int)(*keylen);
|
||||
}
|
||||
|
||||
// cmph "dispose" callback for the std::vector adapter: frees a key buffer
// allocated by CmphVectorAdapterRead(). The other arguments are not used.
void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
{
  char* buffer = key;
  delete[] buffer;
}
|
||||
|
||||
void CmphVectorAdapterRewind(void *data)
|
||||
{
|
||||
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
||||
cmph_vector->position = 0;
|
||||
}
|
||||
|
||||
// Creates a complete cmph key source for `v`: allocates the adapter and
// installs the read / dispose / rewind callbacks for std::vector<std::string>.
cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
{
  cmph_io_adapter_t * adapter = CmphVectorAdapterNew(v);

  adapter->read = CmphVectorAdapterRead;
  adapter->dispose = CmphVectorAdapterDispose;
  adapter->rewind = CmphVectorAdapterRewind;

  return adapter;
}
|
||||
|
||||
}
|
81
moses/src/CompactPT/CmphStringVectorAdapter.h
Normal file
81
moses/src/CompactPT/CmphStringVectorAdapter.h
Normal file
@ -0,0 +1,81 @@
|
||||
#ifndef moses_CmphStringVectorAdapterNew_h
|
||||
#define moses_CmphStringVectorAdapterNew_h
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include "cmph/src/cmph.h"
|
||||
#include "StringVector.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
typedef struct
|
||||
{
|
||||
void *vector;
|
||||
cmph_uint32 position;
|
||||
}
|
||||
cmph_vector_t;
|
||||
|
||||
|
||||
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
|
||||
{
|
||||
cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
|
||||
cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
|
||||
assert(key_source);
|
||||
assert(cmph_vector);
|
||||
|
||||
cmph_vector->vector = (void *)&sv;
|
||||
cmph_vector->position = 0;
|
||||
key_source->data = (void *)cmph_vector;
|
||||
key_source->nkeys = sv.size();
|
||||
|
||||
return key_source;
|
||||
}
|
||||
|
||||
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
|
||||
{
|
||||
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
|
||||
StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
|
||||
size_t size;
|
||||
*keylen = (*sv)[cmph_vector->position].size();
|
||||
size = *keylen;
|
||||
*key = new char[size + 1];
|
||||
std::string temp = (*sv)[cmph_vector->position];
|
||||
std::strcpy(*key, temp.c_str());
|
||||
cmph_vector->position = cmph_vector->position + 1;
|
||||
return (int)(*keylen);
|
||||
}
|
||||
|
||||
void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
|
||||
|
||||
void CmphStringVectorAdapterRewind(void *data);
|
||||
|
||||
template <typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
|
||||
{
|
||||
cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
|
||||
|
||||
key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
|
||||
key_source->dispose = CmphStringVectorAdapterDispose;
|
||||
key_source->rewind = CmphStringVectorAdapterRewind;
|
||||
return key_source;
|
||||
}
|
||||
|
||||
//************************************************************************//
|
||||
|
||||
cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
|
||||
|
||||
int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
|
||||
|
||||
void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
|
||||
|
||||
void CmphVectorAdapterRewind(void *data);
|
||||
|
||||
cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
114
moses/src/CompactPT/ConsistantPhrases.h
Normal file
114
moses/src/CompactPT/ConsistantPhrases.h
Normal file
@ -0,0 +1,114 @@
|
||||
#ifndef moses_ConsistantPhrases_h
|
||||
#define moses_ConsistantPhrases_h
|
||||
|
||||
#include <set>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Enumerates all sub-phrase pairs that are consistent with a set of
// alignment points and serves them in a fixed priority order.
//
// A pair covers m positions starting at i on one side and n positions
// starting at j on the other side. It is consistent when every alignment
// point is either inside both spans or outside both spans.
class ConsistantPhrases
{
public:
  struct Phrase {
    int i, j, m, n;
    // Note the argument order: (i, m, j, n).
    Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
  };

  // Queue order: j ascending, then n descending, then i ascending, then m
  // descending. The call operator is const because associative containers
  // require the comparator to be callable on a const object.
  // (An alternative order - n descending, j ascending, m descending, i
  // ascending - was present in the original only as commented-out code.)
  struct PhraseSorter {
    bool operator()(const Phrase& a, const Phrase& b) const
    {
      if(a.j != b.j)
        return a.j < b.j;
      if(a.n != b.n)
        return a.n > b.n;
      if(a.i != b.i)
        return a.i < b.i;
      return a.m > b.m;
    }
  };

private:
  typedef std::set<Phrase, PhraseSorter> PhraseQueue;
  PhraseQueue m_phraseQueue;

public:

  // Collects every consistent pair for side lengths mmax and nmax.
  // [begin, end) yields alignment points whose `first` is a position on the
  // i/m side and `second` a position on the j/n side. The pair covering
  // both sides completely is excluded.
  template <class It>
  ConsistantPhrases(int mmax, int nmax, It begin, It end)
  {
    for(int i = 0; i < mmax; i++) {
      for(int m = 1; m <= mmax-i; m++) {
        for(int j = 0; j < nmax; j++) {
          for(int n = 1; n <= nmax-j; n++) {
            bool consistant = true;
            for(It it = begin; it != end; it++) {
              const int ip = it->first;
              const int jp = it->second;
              const bool insideI = (i <= ip && ip < i+m);
              const bool insideJ = (j <= jp && jp < j+n);
              if(insideI != insideJ) {
                consistant = false;
                break;
              }
            }
            if(consistant)
              m_phraseQueue.insert(Phrase(i, m, j, n));
          }
        }
      }
    }
    m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
  }

  // Number of pairs still queued.
  size_t Size() const
  {
    return m_phraseQueue.size();
  }

  // Removes and returns the first pair in queue order, or Phrase(0,0,0,0)
  // when the queue is empty.
  Phrase Pop()
  {
    if(m_phraseQueue.empty())
      return Phrase(0,0,0,0);

    Phrase p = *m_phraseQueue.begin();
    m_phraseQueue.erase(m_phraseQueue.begin());
    return p;
  }

  // Discards every queued pair whose span overlaps the span of `p` on
  // either side.
  void RemoveOverlap(Phrase p)
  {
    PhraseQueue ok;
    for(PhraseQueue::iterator it = m_phraseQueue.begin();
        it != m_phraseQueue.end(); it++) {
      const Phrase& pp = *it;
      const bool overlapI = (p.i <= pp.i && pp.i < p.i + p.m)
                            || (pp.i <= p.i && p.i < pp.i + pp.m);
      const bool overlapJ = (p.j <= pp.j && pp.j < p.j + p.n)
                            || (pp.j <= p.j && p.j < pp.j + pp.n);
      if(!(overlapI || overlapJ))
        ok.insert(pp);
    }
    m_phraseQueue.swap(ok);
  }

};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
135
moses/src/CompactPT/LexicalReorderingTableCompact.cpp
Normal file
135
moses/src/CompactPT/LexicalReorderingTableCompact.cpp
Normal file
@ -0,0 +1,135 @@
|
||||
#include "LexicalReorderingTableCompact.h"

#include <cstdio>
#include <stdexcept>
|
||||
|
||||
namespace Moses {
|
||||
|
||||
// Constructs a compact lexical reordering table and immediately loads it
// from filePath via Load().
//
// m_inMemory is taken from StaticData::Instance().UseMinlexrInMemory().
// The score component count (6) and the multiple-score-trees flag (true)
// set here are only initial values: Load() overwrites both with what it
// reads from the file. m_scoreTrees starts with a single NULL slot.
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
|
||||
const std::string& filePath,
|
||||
const std::vector<FactorType>& f_factors,
|
||||
const std::vector<FactorType>& e_factors,
|
||||
const std::vector<FactorType>& c_factors)
|
||||
: LexicalReorderingTable(f_factors, e_factors, c_factors),
|
||||
m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
|
||||
m_numScoreComponent(6), m_multipleScoreTrees(true),
|
||||
m_hash(10, 16), m_scoreTrees(1, NULL)
|
||||
{
|
||||
Load(filePath);
|
||||
}
|
||||
|
||||
// Constructs a compact lexical reordering table without reading any file.
// The members get the same initial values as in the file-loading
// constructor; nothing is loaded here, so m_scoreTrees holds one NULL slot
// until Load() is called.
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
|
||||
const std::vector<FactorType>& f_factors,
|
||||
const std::vector<FactorType>& e_factors,
|
||||
const std::vector<FactorType>& c_factors)
|
||||
: LexicalReorderingTable(f_factors, e_factors, c_factors),
|
||||
m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
|
||||
m_numScoreComponent(6), m_multipleScoreTrees(true),
|
||||
m_hash(10, 16), m_scoreTrees(1, NULL)
|
||||
{ }
|
||||
|
||||
// Frees the Huffman score trees owned by this table. Slots that are still
// NULL (nothing loaded) are harmless because deleting a null pointer is a
// no-op.
LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
{
  typedef std::vector<ScoreTree*>::iterator TreeIterator;
  for(TreeIterator tree = m_scoreTrees.begin(); tree != m_scoreTrees.end(); tree++)
    delete *tree;
}
|
||||
|
||||
std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
|
||||
const Phrase& e,
|
||||
const Phrase& c)
|
||||
{
|
||||
std::string key;
|
||||
Scores scores;
|
||||
|
||||
if(0 == c.GetSize())
|
||||
key = MakeKey(f, e, c);
|
||||
else
|
||||
for(size_t i = 0; i <= c.GetSize(); ++i)
|
||||
{
|
||||
Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
|
||||
key = MakeKey(f,e,sub_c);
|
||||
}
|
||||
|
||||
size_t index = m_hash[key];
|
||||
if(m_hash.GetSize() != index)
|
||||
{
|
||||
std::string scoresString;
|
||||
if(m_inMemory)
|
||||
scoresString = m_scoresMemory[index];
|
||||
else
|
||||
scoresString = m_scoresMapped[index];
|
||||
|
||||
BitStream<> bitStream(scoresString);
|
||||
for(size_t i = 0; i < m_numScoreComponent; i++)
|
||||
scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->NextSymbol(bitStream));
|
||||
|
||||
return scores;
|
||||
}
|
||||
|
||||
return Scores();
|
||||
}
|
||||
|
||||
std::string LexicalReorderingTableCompact::MakeKey(const Phrase& f,
|
||||
const Phrase& e,
|
||||
const Phrase& c) const
|
||||
{
|
||||
return MakeKey(Trim(f.GetStringRep(m_FactorsF)),
|
||||
Trim(e.GetStringRep(m_FactorsE)),
|
||||
Trim(c.GetStringRep(m_FactorsC)));
|
||||
}
|
||||
|
||||
std::string LexicalReorderingTableCompact::MakeKey(const std::string& f,
|
||||
const std::string& e,
|
||||
const std::string& c) const
|
||||
{
|
||||
std::string key;
|
||||
if(!f.empty())
|
||||
{
|
||||
key += f;
|
||||
}
|
||||
if(!m_FactorsE.empty())
|
||||
{
|
||||
if(!key.empty())
|
||||
{
|
||||
key += " ||| ";
|
||||
}
|
||||
key += e;
|
||||
}
|
||||
if(!m_FactorsC.empty())
|
||||
{
|
||||
if(!key.empty())
|
||||
{
|
||||
key += " ||| ";
|
||||
}
|
||||
key += c;
|
||||
}
|
||||
key += " ||| ";
|
||||
return key;
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCompact::Load(std::string filePath)
|
||||
{
|
||||
std::FILE* pFile = std::fopen(filePath.c_str(), "r");
|
||||
if(m_inMemory)
|
||||
m_hash.Load(pFile);
|
||||
else
|
||||
m_hash.LoadIndex(pFile);
|
||||
|
||||
std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
|
||||
std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile);
|
||||
|
||||
if(m_multipleScoreTrees)
|
||||
{
|
||||
m_scoreTrees.resize(m_numScoreComponent);
|
||||
for(size_t i = 0; i < m_numScoreComponent; i++)
|
||||
m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_scoreTrees.resize(1);
|
||||
m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
|
||||
}
|
||||
|
||||
if(m_inMemory)
|
||||
m_scoresMemory.load(pFile, false);
|
||||
else
|
||||
m_scoresMapped.load(pFile, true);
|
||||
}
|
||||
|
||||
}
|
56
moses/src/CompactPT/LexicalReorderingTableCompact.h
Normal file
56
moses/src/CompactPT/LexicalReorderingTableCompact.h
Normal file
@ -0,0 +1,56 @@
|
||||
#ifndef moses_LexicalReorderingTableCompact_h
|
||||
#define moses_LexicalReorderingTableCompact_h
|
||||
|
||||
#include "LexicalReorderingTable.h"
|
||||
#include "StaticData.h"
|
||||
#include "PhraseDictionary.h"
|
||||
#include "GenerationDictionary.h"
|
||||
#include "TargetPhrase.h"
|
||||
#include "TargetPhraseCollection.h"
|
||||
|
||||
#include "CompactPT/BlockHashIndex.h"
|
||||
#include "CompactPT/CanonicalHuffman.h"
|
||||
#include "CompactPT/StringVector.h"
|
||||
|
||||
namespace Moses {
|
||||
|
||||
class LexicalReorderingTableCompact: public LexicalReorderingTable
|
||||
{
|
||||
private:
|
||||
bool m_inMemory;
|
||||
|
||||
size_t m_numScoreComponent;
|
||||
bool m_multipleScoreTrees;
|
||||
|
||||
BlockHashIndex m_hash;
|
||||
|
||||
typedef CanonicalHuffman<float> ScoreTree;
|
||||
std::vector<ScoreTree*> m_scoreTrees;
|
||||
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
|
||||
StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
|
||||
|
||||
std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
|
||||
std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
|
||||
|
||||
public:
|
||||
LexicalReorderingTableCompact(
|
||||
const std::string& filePath,
|
||||
const std::vector<FactorType>& f_factors,
|
||||
const std::vector<FactorType>& e_factors,
|
||||
const std::vector<FactorType>& c_factors);
|
||||
|
||||
LexicalReorderingTableCompact(
|
||||
const std::vector<FactorType>& f_factors,
|
||||
const std::vector<FactorType>& e_factors,
|
||||
const std::vector<FactorType>& c_factors);
|
||||
|
||||
virtual ~LexicalReorderingTableCompact();
|
||||
|
||||
virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
|
||||
void Load(std::string filePath);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
388
moses/src/CompactPT/LexicalReorderingTableCreator.cpp
Normal file
388
moses/src/CompactPT/LexicalReorderingTableCreator.cpp
Normal file
@ -0,0 +1,388 @@
|
||||
#include "LexicalReorderingTableCreator.h"
|
||||
|
||||
namespace Moses {
|
||||
|
||||
// Converts the text reordering table at inPath into the compact format and
// writes it to outPath. All work happens inside this constructor:
//   pass 1 - build the phrase index and count score values (EncodeScores),
//   then   - derive the Huffman code sets (CalcHuffmanCodes),
//   pass 2 - compress the encoded scores (CompressScores),
//   finally - write header, trees and scores (Save).
//
// The suffix ".mphlexr" is appended to outPath unless it is already there.
// If the output file cannot be opened an error is printed to std::cerr and
// the constructor returns without writing anything.
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
  std::string inPath, std::string outPath, size_t numScoreComponent,
  size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
  size_t quantize
#ifdef WITH_THREADS
  , size_t threads
#endif
  )
  : m_inPath(inPath), m_outPath(outPath), m_numScoreComponent(numScoreComponent),
  m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
  m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize),
  m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits),
  m_lastFlushedLine(-1)
#ifdef WITH_THREADS
  , m_threads(threads)
#endif
{
  // One counter (and later one tree) per score component, or a single
  // shared one when a single Huffman code set is requested.
  m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
  for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
      it != m_scoreCounters.end(); it++)
    *it = new ScoreCounter();
  m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);

  // Append the suffix unless the path already ends with it. The length
  // test comes first: "size() - 8" wraps around for paths shorter than the
  // suffix, and for a 7-character path the old rfind() comparison matched
  // npos, so the suffix was silently left off.
  const std::string suffix(".mphlexr");
  if(m_outPath.size() < suffix.size()
     || m_outPath.compare(m_outPath.size() - suffix.size(),
                          suffix.size(), suffix) != 0)
    m_outPath += suffix;

  PrintInfo();

  m_outFile = std::fopen(m_outPath.c_str(), "w");
  if(m_outFile == NULL)
  {
    // Everything below writes through m_outFile; stop instead of crashing.
    std::cerr << "Error: could not open " << m_outPath
              << " for writing" << std::endl;
    return;
  }

  std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
  m_hash.BeginSave(m_outFile);
  EncodeScores();

  std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
  CalcHuffmanCodes();

  std::cerr << "Pass 2/2: Compressing scores" << std::endl;
  CompressScores();

  std::cerr << "Saving to " << m_outPath << std::endl;
  Save();
  std::cerr << "Done" << std::endl;
  std::fclose(m_outFile);
}
|
||||
|
||||
void LexicalReorderingTableCreator::PrintInfo()
|
||||
{
|
||||
std::cerr << "Used options:" << std::endl;
|
||||
std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
|
||||
std::cerr << "\tOuput reordering table will be written to: " << m_outPath << std::endl;
|
||||
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
|
||||
std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
|
||||
std::cerr << "\tNumber of score components in reordering table: " << m_numScoreComponent << std::endl;
|
||||
std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
|
||||
std::cerr << "\tUsing score quantization: ";
|
||||
if(m_quantize)
|
||||
std::cerr << m_quantize << " best" << std::endl;
|
||||
else
|
||||
std::cerr << "no" << std::endl;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
|
||||
#endif
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::EncodeScores()
|
||||
{
|
||||
InputFileStream inFile(m_inPath);
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::thread_group threads;
|
||||
for (size_t i = 0; i < m_threads; ++i)
|
||||
{
|
||||
EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
|
||||
threads.create_thread(*et);
|
||||
}
|
||||
threads.join_all();
|
||||
#else
|
||||
EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
|
||||
(*et)();
|
||||
delete et;
|
||||
#endif
|
||||
FlushEncodedQueue(true);
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::CalcHuffmanCodes()
|
||||
{
|
||||
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
|
||||
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
|
||||
it != m_scoreCounters.end(); it++)
|
||||
{
|
||||
if(m_quantize)
|
||||
(*it)->Quantize(m_quantize);
|
||||
|
||||
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
|
||||
<< " scores" << std::endl;
|
||||
|
||||
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
|
||||
treeIt++;
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::CompressScores()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::thread_group threads;
|
||||
for (size_t i = 0; i < m_threads; ++i) {
|
||||
CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this);
|
||||
threads.create_thread(*ct);
|
||||
}
|
||||
threads.join_all();
|
||||
#else
|
||||
CompressionTaskReordering* ct = new CompressionTaskReordering(m_scores, *this);
|
||||
(*ct)();
|
||||
delete ct;
|
||||
#endif
|
||||
FlushCompressedQueue(true);
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::Save()
|
||||
{
|
||||
std::fwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
|
||||
std::fwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
|
||||
for(size_t i = 0; i < m_scoreTrees.size(); i++)
|
||||
m_scoreTrees[i]->Save(m_outFile);
|
||||
|
||||
m_compressedScores.save(m_outFile);
|
||||
}
|
||||
|
||||
std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
|
||||
{
|
||||
return source + m_separator + target + m_separator;
|
||||
}
|
||||
|
||||
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
|
||||
{
|
||||
std::string scoresString = tokens[2];
|
||||
std::stringstream scoresStream;
|
||||
|
||||
std::vector<float> scores;
|
||||
Tokenize<float>(scores, scoresString);
|
||||
|
||||
size_t c = 0;
|
||||
float score;
|
||||
while(c < m_numScoreComponent)
|
||||
{
|
||||
score = scores[c];
|
||||
score = FloorScore(TransformScore(score));
|
||||
scoresStream.write((char*)&score, sizeof(score));
|
||||
|
||||
m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
|
||||
c++;
|
||||
}
|
||||
|
||||
return scoresStream.str();
|
||||
}
|
||||
|
||||
// Queues one encoded line in m_queue until FlushEncodedQueue() can emit it
// in line order. No locking is done here; in threaded builds
// EncodingTaskReordering::operator() holds its mutex around this call.
void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
|
||||
{
|
||||
m_queue.push(pi);
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
|
||||
if(force || m_queue.size() > 10000)
|
||||
{
|
||||
while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
|
||||
{
|
||||
PackedItem pi = m_queue.top();
|
||||
m_queue.pop();
|
||||
m_lastFlushedLine++;
|
||||
|
||||
m_lastRange.push_back(pi.GetSrc());
|
||||
m_encodedScores.push_back(pi.GetTrg());
|
||||
|
||||
if((pi.GetLine()+1) % 100000 == 0)
|
||||
std::cerr << ".";
|
||||
if((pi.GetLine()+1) % 5000000 == 0)
|
||||
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
|
||||
|
||||
if(m_lastRange.size() == (1ul << m_orderBits))
|
||||
{
|
||||
m_hash.AddRange(m_lastRange);
|
||||
m_hash.SaveLastRange();
|
||||
m_hash.DropLastRange();
|
||||
m_lastRange.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(force)
|
||||
{
|
||||
m_lastFlushedLine = -1;
|
||||
|
||||
m_hash.AddRange(m_lastRange);
|
||||
m_lastRange.clear();
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
m_hash.WaitAll();
|
||||
#endif
|
||||
|
||||
m_hash.SaveLastRange();
|
||||
m_hash.DropLastRange();
|
||||
m_hash.FinalizeSave();
|
||||
|
||||
std::cerr << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) {
|
||||
std::stringstream encodedScoresStream(encodedScores);
|
||||
encodedScoresStream.unsetf(std::ios::skipws);
|
||||
|
||||
std::string compressedScores;
|
||||
BitStream<> compressedScoresStream(compressedScores);
|
||||
|
||||
size_t currScore = 0;
|
||||
float score;
|
||||
encodedScoresStream.read((char*) &score, sizeof(score));
|
||||
|
||||
while(encodedScoresStream) {
|
||||
size_t index = currScore % m_scoreTrees.size();
|
||||
|
||||
if(m_quantize)
|
||||
score = m_scoreCounters[index]->LowerBound(score);
|
||||
|
||||
compressedScoresStream.PutCode(m_scoreTrees[index]->Encode(score));
|
||||
encodedScoresStream.read((char*) &score, sizeof(score));
|
||||
currScore++;
|
||||
}
|
||||
|
||||
return compressedScores;
|
||||
}
|
||||
|
||||
// Queues one compressed score item in m_queue until FlushCompressedQueue()
// can emit it in order. No locking is done here; in threaded builds
// CompressionTaskReordering::operator() holds its mutex around this call.
void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) {
|
||||
m_queue.push(pi);
|
||||
}
|
||||
|
||||
void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
|
||||
{
|
||||
if(force || m_queue.size() > 10000)
|
||||
{
|
||||
while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
|
||||
{
|
||||
PackedItem pi = m_queue.top();
|
||||
m_queue.pop();
|
||||
m_lastFlushedLine++;
|
||||
|
||||
m_compressedScores.push_back(pi.GetTrg());
|
||||
|
||||
if((pi.GetLine()+1) % 100000 == 0)
|
||||
std::cerr << ".";
|
||||
if((pi.GetLine()+1) % 5000000 == 0)
|
||||
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(force)
|
||||
{
|
||||
m_lastFlushedLine = -1;
|
||||
std::cerr << std::endl << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
//****************************************************************************//
|
||||
|
||||
size_t EncodingTaskReordering::m_lineNum = 0;
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex EncodingTaskReordering::m_mutex;
|
||||
boost::mutex EncodingTaskReordering::m_fileMutex;
|
||||
#endif
|
||||
|
||||
// Binds the task to the input stream it reads lines from and to the creator
// that receives the encoded lines. Both are stored as references, so they
// must outlive the task.
EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
|
||||
: m_inFile(inFile), m_creator(creator) {}
|
||||
|
||||
void EncodingTaskReordering::operator()()
|
||||
{
|
||||
size_t lineNum = 0;
|
||||
|
||||
std::vector<std::string> lines;
|
||||
size_t max_lines = 1000;
|
||||
lines.reserve(max_lines);
|
||||
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_fileMutex);
|
||||
#endif
|
||||
std::string line;
|
||||
while(lines.size() < max_lines && std::getline(m_inFile, line))
|
||||
lines.push_back(line);
|
||||
lineNum = m_lineNum;
|
||||
m_lineNum += lines.size();
|
||||
}
|
||||
|
||||
std::vector<PackedItem> result;
|
||||
result.reserve(max_lines);
|
||||
|
||||
while(lines.size())
|
||||
{
|
||||
for(size_t i = 0; i < lines.size(); i++)
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
|
||||
|
||||
std::string encodedLine = m_creator.EncodeLine(tokens);
|
||||
|
||||
PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(tokens[0], tokens[1]),
|
||||
encodedLine, i);
|
||||
result.push_back(packedItem);
|
||||
}
|
||||
lines.clear();
|
||||
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
for(size_t i = 0; i < result.size(); i++)
|
||||
m_creator.AddEncodedLine(result[i]);
|
||||
m_creator.FlushEncodedQueue();
|
||||
}
|
||||
|
||||
result.clear();
|
||||
lines.reserve(max_lines);
|
||||
result.reserve(max_lines);
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_fileMutex);
|
||||
#endif
|
||||
std::string line;
|
||||
while(lines.size() < max_lines && std::getline(m_inFile, line))
|
||||
lines.push_back(line);
|
||||
lineNum = m_lineNum;
|
||||
m_lineNum += lines.size();
|
||||
}
|
||||
}
|
||||
|
||||
//****************************************************************************//
|
||||
|
||||
size_t CompressionTaskReordering::m_scoresNum = 0;
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex CompressionTaskReordering::m_mutex;
|
||||
#endif
|
||||
|
||||
// Binds the task to the vector of encoded score strings it reads from and
// to the creator that compresses and collects them. Both are stored as
// references, so they must outlive the task.
CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
|
||||
MmapAllocator>& encodedScores,
|
||||
LexicalReorderingTableCreator& creator)
|
||||
: m_encodedScores(encodedScores), m_creator(creator)
|
||||
{ }
|
||||
|
||||
void CompressionTaskReordering::operator()()
|
||||
{
|
||||
size_t scoresNum;
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
scoresNum = m_scoresNum;
|
||||
m_scoresNum++;
|
||||
}
|
||||
|
||||
while(scoresNum < m_encodedScores.size())
|
||||
{
|
||||
std::string scores = m_encodedScores[scoresNum];
|
||||
std::string compressedScores
|
||||
= m_creator.CompressEncodedScores(scores);
|
||||
|
||||
std::string dummy;
|
||||
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_creator.AddCompressedScores(packedItem);
|
||||
m_creator.FlushCompressedQueue();
|
||||
|
||||
scoresNum = m_scoresNum;
|
||||
m_scoresNum++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
117
moses/src/CompactPT/LexicalReorderingTableCreator.h
Normal file
117
moses/src/CompactPT/LexicalReorderingTableCreator.h
Normal file
@ -0,0 +1,117 @@
|
||||
#ifndef moses_LexicalReorderingTableCreator_h
|
||||
#define moses_LexicalReorderingTableCreator_h
|
||||
|
||||
#include "PhraseTableCreator.h"
|
||||
|
||||
namespace Moses {
|
||||
|
||||
class LexicalReorderingTableCreator {
|
||||
private:
|
||||
std::string m_inPath;
|
||||
std::string m_outPath;
|
||||
|
||||
std::FILE* m_outFile;
|
||||
|
||||
size_t m_orderBits;
|
||||
size_t m_fingerPrintBits;
|
||||
|
||||
size_t m_numScoreComponent;
|
||||
|
||||
bool m_multipleScoreTrees;
|
||||
bool m_quantize;
|
||||
|
||||
std::string m_separator;
|
||||
|
||||
BlockHashIndex m_hash;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
size_t m_threads;
|
||||
#endif
|
||||
|
||||
typedef Counter<float> ScoreCounter;
|
||||
typedef CanonicalHuffman<float> ScoreTree;
|
||||
|
||||
std::vector<ScoreCounter*> m_scoreCounters;
|
||||
std::vector<ScoreTree*> m_scoreTrees;
|
||||
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator> m_encodedScores;
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator> m_compressedScores;
|
||||
|
||||
std::priority_queue<PackedItem> m_queue;
|
||||
long m_lastFlushedLine;
|
||||
long m_lastFlushedSourceNum;
|
||||
std::string m_lastFlushedSourcePhrase;
|
||||
std::vector<std::string> m_lastRange;
|
||||
|
||||
void PrintInfo();
|
||||
|
||||
void EncodeScores();
|
||||
void CalcHuffmanCodes();
|
||||
void CompressScores();
|
||||
void Save();
|
||||
|
||||
std::string MakeSourceTargetKey(std::string&, std::string&);
|
||||
|
||||
std::string EncodeLine(std::vector<std::string>& tokens);
|
||||
void AddEncodedLine(PackedItem& pi);
|
||||
void FlushEncodedQueue(bool force = false);
|
||||
|
||||
std::string CompressEncodedScores(std::string &encodedScores);
|
||||
void AddCompressedScores(PackedItem& pi);
|
||||
void FlushCompressedQueue(bool force = false);
|
||||
|
||||
public:
|
||||
LexicalReorderingTableCreator(std::string inPath,
|
||||
std::string outPath,
|
||||
size_t numScoreComponent = 6,
|
||||
size_t orderBits = 10,
|
||||
size_t fingerPrintBits = 16,
|
||||
bool multipleScoreTrees = true,
|
||||
size_t quantize = 0
|
||||
#ifdef WITH_THREADS
|
||||
, size_t threads = 2
|
||||
#endif
|
||||
);
|
||||
|
||||
friend class EncodingTaskReordering;
|
||||
friend class CompressionTaskReordering;
|
||||
};
|
||||
|
||||
class EncodingTaskReordering
|
||||
{
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
static boost::mutex m_mutex;
|
||||
static boost::mutex m_fileMutex;
|
||||
#endif
|
||||
static size_t m_lineNum;
|
||||
static size_t m_sourcePhraseNum;
|
||||
static std::string m_lastSourcePhrase;
|
||||
|
||||
InputFileStream& m_inFile;
|
||||
LexicalReorderingTableCreator& m_creator;
|
||||
|
||||
public:
|
||||
EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
|
||||
void operator()();
|
||||
};
|
||||
|
||||
class CompressionTaskReordering
|
||||
{
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
static boost::mutex m_mutex;
|
||||
#endif
|
||||
static size_t m_scoresNum;
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
|
||||
LexicalReorderingTableCreator &m_creator;
|
||||
|
||||
public:
|
||||
CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
|
||||
m_encodedScores, LexicalReorderingTableCreator& creator);
|
||||
void operator()();
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
291
moses/src/CompactPT/ListCoders.h
Normal file
291
moses/src/CompactPT/ListCoders.h
Normal file
@ -0,0 +1,291 @@
|
||||
#ifndef moses_ListCoders_h
|
||||
#define moses_ListCoders_h
|
||||
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Variable-length integer coder working on symbols of type T.
//
// Every symbol carries sizeof(T)*8-1 payload bits, least significant group
// first. The most significant bit of a symbol is a continuation flag: it is
// set on every symbol of a number except the last one.
template <typename T = unsigned int>
class VarIntType
{
  private:
    // Writes the symbols of one number to output and advances it.
    // output is taken by reference so the caller's iterator moves forward;
    // when it was taken by value, Encode() wrote every number to the same
    // position for any iterator that is not an insert iterator.
    template <typename IntType, typename OutIt>
    static void EncodeSymbol(IntType input, OutIt &output)
    {
      if(input == 0)
      {
        *output = 0;
        ++output;
        return;
      }

      const unsigned shift = sizeof(T)*8-1;
      // T(1) rather than the int literal 1: shifting an int by 31 overflows.
      const T msb = static_cast<T>(T(1) << shift);
      const IntType mask = static_cast<IntType>(msb - 1);

      while(input)
      {
        T res = static_cast<T>(input & mask);
        input >>= shift;
        if(input)
          res |= msb;  // more symbols follow
        *output = res;
        ++output;
      }
    }

    // Reads one number starting at it and leaves it behind the last symbol
    // consumed. Asserts that the number is terminated before end.
    template <typename InIt, typename IntType>
    static void DecodeSymbol(InIt &it, InIt end, IntType &output)
    {
      const unsigned shift = sizeof(T)*8-1;
      const T msb = static_cast<T>(T(1) << shift);
      const T payload = static_cast<T>(msb - 1);

      output = 0;
      size_t i = 0;
      // Converting to T first keeps a signed element type (for example
      // char) from sign-extending into the upper bits of the result.
      while(it != end && (static_cast<T>(*it) & msb)) {
        IntType temp = static_cast<T>(*it) & payload;
        temp <<= shift*i;
        output |= temp;
        ++it; ++i;
      }
      assert(it != end);

      // Last symbol of the number: continuation flag is clear.
      IntType temp = static_cast<T>(*it);
      temp <<= shift*i;
      output |= temp;
      ++it;
    }

  public:

    // Encodes every number in [it, end) and writes the symbols to outIt.
    template <typename InIt, typename OutIt>
    static void Encode(InIt it, InIt end, OutIt outIt)
    {
      while(it != end)
      {
        EncodeSymbol(*it, outIt);
        ++it;
      }
    }

    // Decodes all numbers stored in [it, end) and writes them to outIt.
    // it is advanced to end.
    template <typename InIt, typename OutIt>
    static void Decode(InIt &it, InIt end, OutIt outIt)
    {
      while(it != end)
      {
        size_t output;
        DecodeSymbol(it, end, output);
        *outIt = output;
        ++outIt;
      }
    }

    // Decodes at most num numbers starting at it and returns their sum.
    // it is left behind the last symbol consumed.
    template <typename InIt>
    static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
    {
      size_t sum = 0;
      size_t curr = 0;

      while(it != end && curr < num)
      {
        size_t output;
        DecodeSymbol(it, end, output);
        sum += output; curr++;
      }

      return sum;
    }

};
|
||||
|
||||
typedef VarIntType<unsigned char> VarByte;
|
||||
|
||||
typedef VarByte VarInt8;
|
||||
typedef VarIntType<unsigned short> VarInt16;
|
||||
typedef VarIntType<unsigned int> VarInt32;
|
||||
|
||||
// Simple-9 coder: packs a group of small unsigned integers into one 32-bit
// word. The top 4 bits hold a type 1..9 that selects the layout, the lower
// 28 bits hold 1, 2, 3, 4, 5, 7, 9, 14 or 28 values of 28, 14, 9, 7, 5, 4,
// 3, 2 or 1 bits each. Values must fit into 28 bits.
class Simple9
{
  private:
    typedef unsigned int uint;

    // Number of bits needed to store value, at least 1. Integer arithmetic
    // replaces ceil(log(value+1)/log(2)), which yielded 0 for value 0 (and
    // then a division by zero) and depended on floating-point rounding.
    static inline uint RequiredBits(uint value)
    {
      uint bits = 1;
      while(value >>= 1)
        ++bits;
      return bits;
    }

    // Field width, shift of the first field and field mask for a word type.
    // An unknown type asserts; without assertions it is treated like type 1
    // instead of leaving the outputs uninitialized.
    static inline void Layout(uint type, uint &bitlen, uint &shift, uint &mask)
    {
      switch(type)
      {
        case 1: bitlen = 28; shift = 0;  mask = 268435455; break;
        case 2: bitlen = 14; shift = 14; mask = 16383; break;
        case 3: bitlen = 9;  shift = 18; mask = 511; break;
        case 4: bitlen = 7;  shift = 21; mask = 127; break;
        case 5: bitlen = 5;  shift = 20; mask = 31; break;
        case 6: bitlen = 4;  shift = 24; mask = 15; break;
        case 7: bitlen = 3;  shift = 24; mask = 7; break;
        case 8: bitlen = 2;  shift = 26; mask = 3; break;
        case 9: bitlen = 1;  shift = 27; mask = 1; break;
        default:
          assert(!"Simple9: invalid word type");
          bitlen = 28; shift = 0; mask = 268435455;
          break;
      }
    }

    // Packs the values in [it, end) into output. The number of values must
    // be one of the supported group sizes.
    template <typename InIt>
    inline static void EncodeSymbol(uint &output, InIt it, InIt end)
    {
      uint length = end - it;

      uint type = 0;
      uint bitlength = 0;

      switch(length)
      {
        case 1:  type = 1; bitlength = 28; break;
        case 2:  type = 2; bitlength = 14; break;
        case 3:  type = 3; bitlength = 9;  break;
        case 4:  type = 4; bitlength = 7;  break;
        case 5:  type = 5; bitlength = 5;  break;
        case 7:  type = 6; bitlength = 4;  break;
        case 9:  type = 7; bitlength = 3;  break;
        case 14: type = 8; bitlength = 2;  break;
        case 28: type = 9; bitlength = 1;  break;
        default: assert(!"Simple9: unsupported group length"); break;
      }

      output = 0;
      output |= (type << 28);

      // The first value goes into the highest field.
      uint i = 0;
      while(it != end)
      {
        uint l = bitlength * (length-i-1);
        output |= *it << l;
        ++it;
        ++i;
      }
    }

    // Unpacks all values of one word to outIt. outIt is taken by reference
    // so the caller's iterator advances; taken by value, Decode() wrote
    // every word to the same position for non-insert iterators.
    template <typename OutIt>
    static inline void DecodeSymbol(uint input, OutIt &outIt)
    {
      uint bitlen;
      uint shift;
      uint mask;
      Layout(input >> 28, bitlen, shift, mask);

      while(shift > 0)
      {
        *outIt = (input >> shift) & mask;
        shift -= bitlen;
        ++outIt;
      }
      *outIt = input & mask;
      ++outIt;
    }

    // Sums the values of one word, stopping as soon as curr reaches num.
    // curr counts the values consumed so far across words.
    static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr)
    {
      uint bitlen;
      uint shift;
      uint mask;
      Layout(input >> 28, bitlen, shift, mask);

      size_t sum = 0;
      while(shift > 0)
      {
        sum += (input >> shift) & mask;
        shift -= bitlen;
        if(++curr == num)
          return sum;
      }
      sum += input & mask;
      curr++;
      return sum;
    }

  public:
    // Encodes the values in [it, end) and writes one word per group to
    // outIt. Each group is the longest supported group size whose values
    // all fit into the field width of that size. InIt must be a random
    // access iterator.
    template <typename InIt, typename OutIt>
    static void Encode(InIt it, InIt end, OutIt outIt)
    {
      const uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };

      uint buffer[28];
      for(InIt i = it; i < end; i++)
      {
        uint lastbit = 1;  // widest field needed so far in this group
        uint lastpos = 0;  // values examined so far
        uint lastyes = 0;  // index of the last value of the best full group
        uint j = 0;        // index into parts of the group size being tried

        while(j < 9 && lastpos < 28 && (i+lastpos) < end)
        {
          if(lastpos >= parts[j])
            j++;

          buffer[lastpos] = *(i + lastpos);

          uint reqbit = RequiredBits(buffer[lastpos]);
          assert(reqbit <= 28);

          // Round up to the nearest field width that Simple-9 offers.
          uint bit = 28/(28/reqbit);
          if(lastbit < bit)
            lastbit = bit;

          if(parts[j] > 28/lastbit)
            break;
          else if(lastpos == parts[j]-1)
            lastyes = lastpos;

          lastpos++;
        }
        i += lastyes;

        uint length = lastyes + 1;
        uint output;
        EncodeSymbol(output, buffer, buffer + length);

        *outIt = output;
        ++outIt;
      }
    }

    // Decodes every word in [it, end) to outIt; it is advanced to end.
    template <typename InIt, typename OutIt>
    static void Decode(InIt &it, InIt end, OutIt outIt)
    {
      while(it != end)
      {
        DecodeSymbol(*it, outIt);
        ++it;
      }
    }

    // Returns the sum of the first num values stored in [it, end) and
    // leaves it behind the last word touched. Asserts that num values were
    // available.
    template <typename InIt>
    static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
    {
      size_t sum = 0;
      size_t curr = 0;
      while(it != end && curr < num)
      {
        sum += DecodeAndSumSymbol(*it, num, curr);
        ++it;
      }
      assert(curr == num);
      return sum;
    }
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
175
moses/src/CompactPT/MmapAllocator.h
Normal file
175
moses/src/CompactPT/MmapAllocator.h
Normal file
@ -0,0 +1,175 @@
|
||||
#ifndef moses_MmapAllocator_h
|
||||
#define moses_MmapAllocator_h
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <sys/mman.h>
|
||||
#include <cstdio>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
template <class T>
|
||||
class MmapAllocator
|
||||
{
|
||||
protected:
|
||||
std::FILE* m_file_ptr;
|
||||
size_t m_file_desc;
|
||||
|
||||
size_t m_page_size;
|
||||
size_t m_map_size;
|
||||
|
||||
char* m_data_ptr;
|
||||
size_t m_data_offset;
|
||||
bool m_fixed;
|
||||
|
||||
public:
|
||||
typedef T value_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
typedef T& reference;
|
||||
typedef const T& const_reference;
|
||||
typedef std::size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
|
||||
MmapAllocator() throw()
|
||||
: m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false)
|
||||
{ }
|
||||
|
||||
MmapAllocator(std::FILE* f_ptr) throw()
|
||||
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false)
|
||||
{ }
|
||||
|
||||
MmapAllocator(std::FILE* f_ptr, size_t data_offset = 0) throw()
|
||||
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(data_offset), m_fixed(true)
|
||||
{ }
|
||||
|
||||
MmapAllocator(std::string fileName) throw()
|
||||
: m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false)
|
||||
{ }
|
||||
|
||||
MmapAllocator(const MmapAllocator& c) throw()
|
||||
: m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
|
||||
m_page_size(c.m_page_size), m_map_size(c.m_map_size),
|
||||
m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
|
||||
m_fixed(c.m_fixed)
|
||||
{ }
|
||||
|
||||
~MmapAllocator() throw()
|
||||
{
|
||||
if(m_data_ptr)
|
||||
{
|
||||
munmap(m_data_ptr, m_map_size);
|
||||
if(!m_fixed && std::ftell(m_file_ptr) != -1)
|
||||
std::fclose(m_file_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
template <class U>
|
||||
struct rebind {
|
||||
typedef MmapAllocator<U> other;
|
||||
};
|
||||
|
||||
// Returns the address of a mutable element.
pointer address (reference value) const
{
  pointer where = &value;
  return where;
}
|
||||
|
||||
// Returns the address of a read-only element.
const_pointer address (const_reference value) const
{
  const_pointer where = &value;
  return where;
}
|
||||
|
||||
// Largest element count whose byte size still fits into a size_t.
size_type max_size () const throw()
{
  const size_t maxBytes = std::numeric_limits<size_t>::max();
  return maxBytes / sizeof(value_type);
}
|
||||
|
||||
// Maps storage for num elements of T and returns a pointer to the first one.
// The second (hint) argument is ignored.
//
// Fixed mode:     maps the existing file read-only, starting at the page
//                 boundary at or below m_data_offset, and returns a pointer
//                 adjusted to m_data_offset itself.
// Non-fixed mode: resizes the backing file to the requested byte size and
//                 maps it read/write from offset 0.
//
// NOTE(review): the results of ftruncate() and mmap() are not checked here.
pointer allocate (size_type num, const void* = 0)
{
  // Remembered because the destructor hands m_map_size to munmap().
  m_map_size = num * sizeof(T);

  if(m_fixed)
  {
    // Split the data offset into a page-aligned part (usable as the mmap
    // offset) and the remainder inside that page.
    const size_t relative_offset = m_data_offset % m_page_size;
    const size_t map_offset = m_data_offset - relative_offset;

    m_data_ptr = (char*)mmap(0, m_map_size + relative_offset, PROT_READ,
                             MAP_SHARED, m_file_desc, map_offset);

    return (pointer)(m_data_ptr + relative_offset);
  }

  ftruncate(m_file_desc, m_map_size);
  m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
                           m_file_desc, 0);
  return (pointer)m_data_ptr;
}
|
||||
|
||||
// Unmaps storage previously returned by allocate(num).
//
// In fixed mode allocate() maps (num * sizeof(T) + relative_offset) bytes
// starting at the page boundary below m_data_offset and returns a pointer
// advanced by relative_offset. The same base address AND the same length
// have to be used here; unmapping only num * sizeof(T) bytes could leave the
// last page(s) of the mapping behind.
void deallocate (pointer p, size_type num)
{
  if(!m_fixed)
  {
    munmap(p, num * sizeof(T));
    return;
  }

  const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
  const size_t relative_offset = m_data_offset - map_offset;

  munmap((char*)p - relative_offset, num * sizeof(T) + relative_offset);
}
|
||||
|
||||
// Copy-constructs value at p. Does nothing in fixed mode, where allocate()
// maps the file read-only.
void construct (pointer p, const T& value)
{
  if(m_fixed)
    return;

  new(p) value_type(value);
}
|
||||
// Runs the destructor of the element at p. Does nothing in fixed mode,
// mirroring construct().
void destroy (pointer p)
{
  if(m_fixed)
    return;

  p->~T();
}
|
||||
|
||||
template <class T1, class T2>
|
||||
friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
|
||||
|
||||
template <class T1, class T2>
|
||||
friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
|
||||
};
|
||||
|
||||
template <class T1, class T2>
|
||||
bool operator== (const MmapAllocator<T1>& a1,
|
||||
const MmapAllocator<T2>& a2) throw()
|
||||
{
|
||||
bool equal = true;
|
||||
equal &= a1.m_file_ptr == a2.m_file_ptr;
|
||||
equal &= a1.m_file_desc == a2.m_file_desc;
|
||||
equal &= a1.m_page_size == a2.m_page_size;
|
||||
equal &= a1.m_map_size == a2.m_map_size;
|
||||
equal &= a1.m_data_ptr == a2.m_data_ptr;
|
||||
equal &= a1.m_data_offset == a2.m_data_offset;
|
||||
equal &= a1.m_fixed == a2.m_fixed;
|
||||
return equal;
|
||||
}
|
||||
|
||||
template <class T1, class T2>
|
||||
bool operator!=(const MmapAllocator<T1>& a1,
|
||||
const MmapAllocator<T2>& a2) throw()
|
||||
{
|
||||
return !(a1 == a2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
227
moses/src/CompactPT/MonotonicVector.h
Normal file
227
moses/src/CompactPT/MonotonicVector.h
Normal file
@ -0,0 +1,227 @@
|
||||
#ifndef moses_MonotonicVector_h
|
||||
#define moses_MonotonicVector_h
|
||||
|
||||
// MonotonicVector - Represents a monotonically non-decreasing function that maps
|
||||
// positive integers of any size onto a given number type. Each value has to be
|
||||
// equal or larger than the previous one. Depending on the stepSize it can save
|
||||
// up to 90% of memory compared to a std::vector<long>. Time complexity is roughly
|
||||
// constant, in the worst case, however, stepSize times slower than a normal
|
||||
// std::vector.
|
||||
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
|
||||
#include "ListCoders.h"
|
||||
#include "MmapAllocator.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
|
||||
template <typename> class Allocator = std::allocator>
|
||||
class MonotonicVector
|
||||
{
|
||||
private:
|
||||
typedef std::vector<NumT, Allocator<NumT> > Anchors;
|
||||
typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
|
||||
|
||||
Anchors m_anchors;
|
||||
Diffs m_diffs;
|
||||
std::vector<unsigned int> m_tempDiffs;
|
||||
|
||||
size_t m_size;
|
||||
PosT m_last;
|
||||
bool m_final;
|
||||
|
||||
public:
|
||||
typedef PosT value_type;
|
||||
|
||||
MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
|
||||
|
||||
size_t size() const
|
||||
{
|
||||
return m_size + m_tempDiffs.size();
|
||||
}
|
||||
|
||||
PosT at(size_t i) const
|
||||
{
|
||||
PosT s = stepSize;
|
||||
PosT j = m_anchors[i / s];
|
||||
PosT r = i % s;
|
||||
|
||||
typename Diffs::const_iterator it = m_diffs.begin() + j;
|
||||
|
||||
PosT k = 0;
|
||||
k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
|
||||
if(i < m_size)
|
||||
k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
|
||||
else if(i < m_size + m_tempDiffs.size())
|
||||
for(size_t l = 0; l < r; l++)
|
||||
k += m_tempDiffs[l];
|
||||
|
||||
return k;
|
||||
}
|
||||
|
||||
PosT operator[](PosT i) const
|
||||
{
|
||||
return at(i);
|
||||
}
|
||||
|
||||
PosT back() const
|
||||
{
|
||||
return at(size()-1);
|
||||
}
|
||||
|
||||
void push_back(PosT i)
|
||||
{
|
||||
assert(m_final != true);
|
||||
|
||||
if(m_anchors.size() == 0 && m_tempDiffs.size() == 0)
|
||||
{
|
||||
m_anchors.push_back(0);
|
||||
VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
|
||||
m_last = i;
|
||||
m_size++;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_tempDiffs.size() == stepSize-1)
|
||||
{
|
||||
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
|
||||
std::back_inserter(m_diffs));
|
||||
m_anchors.push_back(m_diffs.size());
|
||||
VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
|
||||
|
||||
m_size += m_tempDiffs.size() + 1;
|
||||
m_tempDiffs.clear();
|
||||
}
|
||||
else
|
||||
{
|
||||
PosT last = m_last;
|
||||
PosT diff = i - last;
|
||||
m_tempDiffs.push_back(diff);
|
||||
}
|
||||
m_last = i;
|
||||
}
|
||||
|
||||
void commit()
|
||||
{
|
||||
assert(m_final != true);
|
||||
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
|
||||
std::back_inserter(m_diffs));
|
||||
m_size += m_tempDiffs.size();
|
||||
m_tempDiffs.clear();
|
||||
m_final = true;
|
||||
}
|
||||
|
||||
size_t usage()
|
||||
{
|
||||
return m_diffs.size() * sizeof(unsigned int)
|
||||
+ m_anchors.size() * sizeof(NumT);
|
||||
}
|
||||
|
||||
size_t load(std::FILE* in, bool map = false)
|
||||
{
|
||||
size_t byteSize = 0;
|
||||
|
||||
byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
|
||||
byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
|
||||
byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
|
||||
|
||||
byteSize += loadVector(m_diffs, in, map);
|
||||
byteSize += loadVector(m_anchors, in, map);
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
template <typename ValueT>
|
||||
size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
|
||||
std::FILE* in, bool map = false)
|
||||
{
|
||||
// Can only be read into memory. Mapping not possible with std:allocator.
|
||||
assert(map == false);
|
||||
|
||||
size_t byteSize = 0;
|
||||
|
||||
size_t valSize;
|
||||
byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
|
||||
|
||||
v.resize(valSize, 0);
|
||||
byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
template <typename ValueT>
|
||||
size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
|
||||
std::FILE* in, bool map = false)
|
||||
{
|
||||
size_t byteSize = 0;
|
||||
|
||||
size_t valSize;
|
||||
byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
|
||||
|
||||
if(map == false)
|
||||
{
|
||||
// Read data into temporary file (default constructor of MmapAllocator)
|
||||
// and map memory onto temporary file. Can be resized.
|
||||
|
||||
v.resize(valSize, 0);
|
||||
byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Map it directly on specified region of file "in" starting at valPos
|
||||
// with length valSize * sizeof(ValueT). Mapped region cannot be resized.
|
||||
|
||||
size_t valPos = std::ftell(in);
|
||||
|
||||
Allocator<ValueT> alloc(in, valPos);
|
||||
std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
|
||||
vTemp.resize(valSize);
|
||||
v.swap(vTemp);
|
||||
|
||||
std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
|
||||
byteSize += valSize * sizeof(ValueT);
|
||||
}
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t save(std::FILE* out)
|
||||
{
|
||||
if(!m_final)
|
||||
commit();
|
||||
|
||||
bool byteSize = 0;
|
||||
byteSize += fwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
|
||||
byteSize += fwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
|
||||
byteSize += fwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
|
||||
|
||||
size_t size = m_diffs.size();
|
||||
byteSize += fwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
|
||||
byteSize += fwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
|
||||
|
||||
size = m_anchors.size();
|
||||
byteSize += fwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
|
||||
byteSize += fwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv)
|
||||
{
|
||||
if(!m_final)
|
||||
commit();
|
||||
|
||||
m_diffs.swap(mv.m_diffs);
|
||||
m_anchors.swap(mv.m_anchors);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
335
moses/src/CompactPT/MurmurHash3.cpp
Normal file
335
moses/src/CompactPT/MurmurHash3.cpp
Normal file
@ -0,0 +1,335 @@
|
||||
//-----------------------------------------------------------------------------
|
||||
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||
// domain. The author hereby disclaims copyright to this source code.
|
||||
|
||||
// Note - The x86 and x64 versions do _not_ produce the same results, as the
|
||||
// algorithms are optimized for their respective platforms. You can still
|
||||
// compile and run any of them on any platform, but your performance with the
|
||||
// non-native version will be less than optimal.
|
||||
|
||||
#include "MurmurHash3.h"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Platform-specific functions and macros
|
||||
|
||||
// Microsoft Visual Studio
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define FORCE_INLINE __forceinline
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define ROTL32(x,y) _rotl(x,y)
|
||||
#define ROTL64(x,y) _rotl64(x,y)
|
||||
|
||||
#define BIG_CONSTANT(x) (x)
|
||||
|
||||
// Other compilers
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline))
|
||||
|
||||
// Rotates the 32-bit value x left by r bits.
// Both shift counts are reduced modulo 32 so that r == 0 (or 32) does not
// produce an out-of-range shift by the full word width; for r in 1..31 the
// result is unchanged.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << (r & 31)) | (x >> ((32 - r) & 31));
}
|
||||
|
||||
// Rotates the 64-bit value x left by r bits.
// Both shift counts are reduced modulo 64 so that r == 0 (or 64) does not
// produce an out-of-range shift by the full word width; for r in 1..63 the
// result is unchanged.
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << (r & 63)) | (x >> ((64 - r) & 63));
}
|
||||
|
||||
#define ROTL32(x,y) rotl32(x,y)
|
||||
#define ROTL64(x,y) rotl64(x,y)
|
||||
|
||||
#define BIG_CONSTANT(x) (x##LLU)
|
||||
|
||||
#endif // !defined(_MSC_VER)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Block read - if your platform needs to do endian-swapping or can only
|
||||
// handle aligned reads, do the conversion here
|
||||
|
||||
// Reads the i-th 32-bit block relative to p. No byte swapping or alignment
// handling is performed here.
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
  return *(p + i);
}
|
||||
|
||||
// Reads the i-th 64-bit block relative to p. No byte swapping or alignment
// handling is performed here.
FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
  return *(p + i);
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Finalization mix - force all bits of a hash block to avalanche
|
||||
|
||||
// 32-bit finalization mix: three xor-shift steps interleaved with two
// multiplications.
FORCE_INLINE uint32_t fmix ( uint32_t h )
{
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;

  return h ^ (h >> 16);
}
|
||||
|
||||
//----------
|
||||
|
||||
// 64-bit finalization mix: three xor-shift steps interleaved with two
// multiplications.
FORCE_INLINE uint64_t fmix ( uint64_t k )
{
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);

  return k ^ (k >> 33);
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void MurmurHash3_x86_32 ( const void * key, int len,
|
||||
uint32_t seed, void * out )
|
||||
{
|
||||
const uint8_t * data = (const uint8_t*)key;
|
||||
const int nblocks = len / 4;
|
||||
|
||||
uint32_t h1 = seed;
|
||||
|
||||
uint32_t c1 = 0xcc9e2d51;
|
||||
uint32_t c2 = 0x1b873593;
|
||||
|
||||
//----------
|
||||
// body
|
||||
|
||||
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
|
||||
|
||||
for(int i = -nblocks; i; i++)
|
||||
{
|
||||
uint32_t k1 = getblock(blocks,i);
|
||||
|
||||
k1 *= c1;
|
||||
k1 = ROTL32(k1,15);
|
||||
k1 *= c2;
|
||||
|
||||
h1 ^= k1;
|
||||
h1 = ROTL32(h1,13);
|
||||
h1 = h1*5+0xe6546b64;
|
||||
}
|
||||
|
||||
//----------
|
||||
// tail
|
||||
|
||||
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
|
||||
|
||||
uint32_t k1 = 0;
|
||||
|
||||
switch(len & 3)
|
||||
{
|
||||
case 3: k1 ^= tail[2] << 16;
|
||||
case 2: k1 ^= tail[1] << 8;
|
||||
case 1: k1 ^= tail[0];
|
||||
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||
};
|
||||
|
||||
//----------
|
||||
// finalization
|
||||
|
||||
h1 ^= len;
|
||||
|
||||
h1 = fmix(h1);
|
||||
|
||||
*(uint32_t*)out = h1;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void MurmurHash3_x86_128 ( const void * key, const int len,
|
||||
uint32_t seed, void * out )
|
||||
{
|
||||
const uint8_t * data = (const uint8_t*)key;
|
||||
const int nblocks = len / 16;
|
||||
|
||||
uint32_t h1 = seed;
|
||||
uint32_t h2 = seed;
|
||||
uint32_t h3 = seed;
|
||||
uint32_t h4 = seed;
|
||||
|
||||
uint32_t c1 = 0x239b961b;
|
||||
uint32_t c2 = 0xab0e9789;
|
||||
uint32_t c3 = 0x38b34ae5;
|
||||
uint32_t c4 = 0xa1e38b93;
|
||||
|
||||
//----------
|
||||
// body
|
||||
|
||||
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
|
||||
|
||||
for(int i = -nblocks; i; i++)
|
||||
{
|
||||
uint32_t k1 = getblock(blocks,i*4+0);
|
||||
uint32_t k2 = getblock(blocks,i*4+1);
|
||||
uint32_t k3 = getblock(blocks,i*4+2);
|
||||
uint32_t k4 = getblock(blocks,i*4+3);
|
||||
|
||||
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||
|
||||
h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
|
||||
|
||||
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
|
||||
|
||||
h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
|
||||
|
||||
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
|
||||
|
||||
h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
|
||||
|
||||
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
|
||||
|
||||
h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
|
||||
}
|
||||
|
||||
//----------
|
||||
// tail
|
||||
|
||||
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
|
||||
|
||||
uint32_t k1 = 0;
|
||||
uint32_t k2 = 0;
|
||||
uint32_t k3 = 0;
|
||||
uint32_t k4 = 0;
|
||||
|
||||
switch(len & 15)
|
||||
{
|
||||
case 15: k4 ^= tail[14] << 16;
|
||||
case 14: k4 ^= tail[13] << 8;
|
||||
case 13: k4 ^= tail[12] << 0;
|
||||
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
|
||||
|
||||
case 12: k3 ^= tail[11] << 24;
|
||||
case 11: k3 ^= tail[10] << 16;
|
||||
case 10: k3 ^= tail[ 9] << 8;
|
||||
case 9: k3 ^= tail[ 8] << 0;
|
||||
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
|
||||
|
||||
case 8: k2 ^= tail[ 7] << 24;
|
||||
case 7: k2 ^= tail[ 6] << 16;
|
||||
case 6: k2 ^= tail[ 5] << 8;
|
||||
case 5: k2 ^= tail[ 4] << 0;
|
||||
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
|
||||
|
||||
case 4: k1 ^= tail[ 3] << 24;
|
||||
case 3: k1 ^= tail[ 2] << 16;
|
||||
case 2: k1 ^= tail[ 1] << 8;
|
||||
case 1: k1 ^= tail[ 0] << 0;
|
||||
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
|
||||
};
|
||||
|
||||
//----------
|
||||
// finalization
|
||||
|
||||
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
|
||||
|
||||
h1 += h2; h1 += h3; h1 += h4;
|
||||
h2 += h1; h3 += h1; h4 += h1;
|
||||
|
||||
h1 = fmix(h1);
|
||||
h2 = fmix(h2);
|
||||
h3 = fmix(h3);
|
||||
h4 = fmix(h4);
|
||||
|
||||
h1 += h2; h1 += h3; h1 += h4;
|
||||
h2 += h1; h3 += h1; h4 += h1;
|
||||
|
||||
((uint32_t*)out)[0] = h1;
|
||||
((uint32_t*)out)[1] = h2;
|
||||
((uint32_t*)out)[2] = h3;
|
||||
((uint32_t*)out)[3] = h4;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void MurmurHash3_x64_128 ( const void * key, const int len,
|
||||
const uint32_t seed, void * out )
|
||||
{
|
||||
const uint8_t * data = (const uint8_t*)key;
|
||||
const int nblocks = len / 16;
|
||||
|
||||
uint64_t h1 = seed;
|
||||
uint64_t h2 = seed;
|
||||
|
||||
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
|
||||
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
|
||||
|
||||
//----------
|
||||
// body
|
||||
|
||||
const uint64_t * blocks = (const uint64_t *)(data);
|
||||
|
||||
for(int i = 0; i < nblocks; i++)
|
||||
{
|
||||
uint64_t k1 = getblock(blocks,i*2+0);
|
||||
uint64_t k2 = getblock(blocks,i*2+1);
|
||||
|
||||
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
|
||||
|
||||
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
|
||||
|
||||
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
|
||||
|
||||
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
|
||||
}
|
||||
|
||||
//----------
|
||||
// tail
|
||||
|
||||
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
|
||||
|
||||
uint64_t k1 = 0;
|
||||
uint64_t k2 = 0;
|
||||
|
||||
switch(len & 15)
|
||||
{
|
||||
case 15: k2 ^= uint64_t(tail[14]) << 48;
|
||||
case 14: k2 ^= uint64_t(tail[13]) << 40;
|
||||
case 13: k2 ^= uint64_t(tail[12]) << 32;
|
||||
case 12: k2 ^= uint64_t(tail[11]) << 24;
|
||||
case 11: k2 ^= uint64_t(tail[10]) << 16;
|
||||
case 10: k2 ^= uint64_t(tail[ 9]) << 8;
|
||||
case 9: k2 ^= uint64_t(tail[ 8]) << 0;
|
||||
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
|
||||
|
||||
case 8: k1 ^= uint64_t(tail[ 7]) << 56;
|
||||
case 7: k1 ^= uint64_t(tail[ 6]) << 48;
|
||||
case 6: k1 ^= uint64_t(tail[ 5]) << 40;
|
||||
case 5: k1 ^= uint64_t(tail[ 4]) << 32;
|
||||
case 4: k1 ^= uint64_t(tail[ 3]) << 24;
|
||||
case 3: k1 ^= uint64_t(tail[ 2]) << 16;
|
||||
case 2: k1 ^= uint64_t(tail[ 1]) << 8;
|
||||
case 1: k1 ^= uint64_t(tail[ 0]) << 0;
|
||||
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
|
||||
};
|
||||
|
||||
//----------
|
||||
// finalization
|
||||
|
||||
h1 ^= len; h2 ^= len;
|
||||
|
||||
h1 += h2;
|
||||
h2 += h1;
|
||||
|
||||
h1 = fmix(h1);
|
||||
h2 = fmix(h2);
|
||||
|
||||
h1 += h2;
|
||||
h2 += h1;
|
||||
|
||||
((uint64_t*)out)[0] = h1;
|
||||
((uint64_t*)out)[1] = h2;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
37
moses/src/CompactPT/MurmurHash3.h
Normal file
37
moses/src/CompactPT/MurmurHash3.h
Normal file
@ -0,0 +1,37 @@
|
||||
//-----------------------------------------------------------------------------
|
||||
// MurmurHash3 was written by Austin Appleby, and is placed in the public
|
||||
// domain. The author hereby disclaims copyright to this source code.
|
||||
|
||||
#ifndef _MURMURHASH3_H_
|
||||
#define _MURMURHASH3_H_
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Platform-specific functions and macros
|
||||
|
||||
// Microsoft Visual Studio
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned long uint32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
|
||||
// Other compilers
|
||||
|
||||
#else // defined(_MSC_VER)
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#endif // !defined(_MSC_VER)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
|
||||
|
||||
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
|
||||
|
||||
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#endif // _MURMURHASH3_H_
|
177
moses/src/CompactPT/PackedArray.h
Normal file
177
moses/src/CompactPT/PackedArray.h
Normal file
@ -0,0 +1,177 @@
|
||||
#ifndef moses_PackedArray_h
|
||||
#define moses_PackedArray_h
|
||||
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Array of fixed-width bit fields packed into a buffer of D units.
//
// T is the value type handed to Set()/returned by Get(), D the storage unit.
// The field width ("bits") is not stored; callers pass the same width to the
// constructor, Set() and Get(). Element i occupies bits
// [i * bits, (i + 1) * bits) of the buffer.
template <typename T = size_t, typename D = unsigned char>
class PackedArray
{
  protected:
    static size_t m_dataBits;   // number of bits in one storage unit D

    size_t m_size;              // number of elements
    size_t m_storageSize;       // number of D units in m_storage
    D* m_storage;               // owned buffer

    // Word with the lowest n bits set; saturates to all ones when n reaches
    // the width of size_t, so no shift by the full word width is performed.
    static size_t LowMask(size_t n)
    {
      return n >= sizeof(size_t) * 8 ? ~size_t(0) : (size_t(1) << n) - 1;
    }

  public:
    // Empty array.
    PackedArray() : m_size(0), m_storageSize(0), m_storage(new D[0]) { }

    // Array of size elements of the given bit width, all initialised to 0.
    PackedArray(size_t size, size_t bits) : m_size(size)
    {
      // Integer ceiling division: a float computation loses precision for
      // large bit counts and can under-allocate the buffer.
      m_storageSize = (bits * size + m_dataBits - 1) / m_dataBits;
      m_storage = new D[m_storageSize]();
    }

    PackedArray(const PackedArray<T, D> &c)
    {
      m_size = c.m_size;

      m_storageSize = c.m_storageSize;
      m_storage = new D[m_storageSize];

      std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
    }

    // Deep-copying assignment; without it the implicitly generated operator
    // would share (and later double-delete) the buffer.
    PackedArray<T, D>& operator=(const PackedArray<T, D> &c)
    {
      if(this != &c) {
        D* storage = new D[c.m_storageSize];
        std::memcpy(storage, c.m_storage, c.m_storageSize * sizeof(D));

        delete [] m_storage;
        m_storage = storage;
        m_size = c.m_size;
        m_storageSize = c.m_storageSize;
      }
      return *this;
    }

    // Virtual because the class has virtual members and is derived from.
    virtual ~PackedArray()
    {
      delete [] m_storage;
      m_size = 0;
      m_storageSize = 0;
      m_storage = 0;
    }

    // Returns element i, interpreted as a field of the given bit width.
    T Get(size_t i, size_t bits) const
    {
      T out = 0;

      size_t bitstart = (i * bits);
      size_t bitpos = bitstart;

      size_t zero = LowMask(bits);

      // Collect the field unit by unit; only the first unit can start at a
      // non-zero bit offset.
      while(bitpos - bitstart < bits) {
        size_t pos = bitpos / m_dataBits;
        size_t off = bitpos % m_dataBits;

        out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;

        bitpos += (m_dataBits - off);
      }

      // Drop bits of following elements picked up from the last unit.
      out &= zero;
      return out;
    }

    // Stores v as element i using the given bit width. v is expected to fit
    // into that many bits.
    void Set(size_t i, T v, size_t bits)
    {
      size_t bitstart = (i * bits);
      size_t bitpos = bitstart;

      while(bitpos - bitstart < bits) {
        size_t pos = bitpos / m_dataBits;
        size_t off = bitpos % m_dataBits;

        // Keep the bits below the field and above its end, clear the rest.
        size_t rest = bits - (bitpos - bitstart);
        D zero = ~LowMask(rest + off) | LowMask(off);

        m_storage[pos] &= zero;
        m_storage[pos] |= v << off;
        v = v >> (m_dataBits - off);
        bitpos += (m_dataBits - off);
      }
    }

    virtual D*& GetStorage()
    {
      return m_storage;
    }

    virtual size_t GetStorageSize() const
    {
      return m_storageSize;
    }

    virtual size_t Size() const
    {
      return m_size;
    }

    // Replaces the contents with data written by Save(); returns the number
    // of bytes the file position advanced.
    virtual size_t Load(std::FILE* in)
    {
      size_t a1 = std::ftell(in);

      std::fread(&m_size, sizeof(m_size), 1, in);
      std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
      delete [] m_storage;
      m_storage = new D[m_storageSize];
      std::fread(m_storage, sizeof(D), m_storageSize, in);

      size_t a2 = std::ftell(in);
      return a2 - a1;
    }

    // Writes element count, storage size and the raw buffer; returns the
    // number of bytes the file position advanced.
    virtual size_t Save(std::FILE* out)
    {
      size_t a1 = std::ftell(out);

      std::fwrite(&m_size, sizeof(m_size), 1, out);
      std::fwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
      std::fwrite(m_storage, sizeof(D), m_storageSize, out);

      size_t a2 = std::ftell(out);
      return a2 - a1;
    }

};

template <typename T, typename D>
size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
|
||||
|
||||
/**************************************************************************/
|
||||
|
||||
template <typename T = size_t, typename D = unsigned char>
|
||||
class PairedPackedArray : public PackedArray<T,D>
|
||||
{
|
||||
public:
|
||||
PairedPackedArray() : PackedArray<T,D>() {}
|
||||
|
||||
PairedPackedArray(size_t size, size_t bits1, size_t bits2)
|
||||
: PackedArray<T, D>(size, bits1 + bits2) { }
|
||||
|
||||
void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
|
||||
{
|
||||
T c = 0;
|
||||
c = a | (b << bits1);
|
||||
PackedArray<T,D>::Set(i, c, bits1 + bits2);
|
||||
}
|
||||
|
||||
void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2)
|
||||
{
|
||||
T c = 0;
|
||||
c = p.second | (p.first << bits1);
|
||||
PackedArray<T, D>::Set(i, c);
|
||||
}
|
||||
|
||||
std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2)
|
||||
{
|
||||
T v = PackedArray<T, D>::Get(i, bits1 + bits2);
|
||||
T a = v & ((1 << bits1) - 1);
|
||||
T b = v >> bits1;
|
||||
return std::pair<T, T>(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
469
moses/src/CompactPT/PhraseDecoder.cpp
Normal file
469
moses/src/CompactPT/PhraseDecoder.cpp
Normal file
@ -0,0 +1,469 @@
|
||||
#include <deque>
|
||||
|
||||
#include "PhraseDecoder.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Creates a decoder without any loaded model data. Coding scheme, symbol
// tables and Huffman trees are filled in by Load(); until then the tree
// pointers are null (m_scoreTrees holds a single null entry).
PhraseDecoder::PhraseDecoder(
  PhraseDictionaryCompact &phraseDictionary,
  const std::vector<FactorType>* &input,
  const std::vector<FactorType>* &output,
  const PhraseDictionaryFeature* feature,
  size_t numScoreComponent,
  const std::vector<float>* weight,
  float weightWP,
  const LMList* languageModels
)
  : m_coding(None), m_numScoreComponent(numScoreComponent),
    m_containsAlignmentInfo(true), m_maxRank(0),
    m_symbolTree(0), m_multipleScoreTrees(false),
    m_scoreTrees(1), m_alignTree(0),
    m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
    m_feature(feature), m_weight(weight),
    m_weightWP(weightWP), m_languageModels(languageModels),
    m_separator(" ||| ")
{
  // Load() overwrites this value; give it a defined one so that
  // GetMaxSourcePhraseLength() does not read an uninitialised member when
  // called before Load().
  m_maxPhraseLength = 0;
}
|
||||
|
||||
// Releases the Huffman trees owned by the decoder. Entries that were never
// allocated are null, and deleting a null pointer has no effect.
PhraseDecoder::~PhraseDecoder()
{
  delete m_symbolTree;

  for(size_t tree = 0; tree < m_scoreTrees.size(); ++tree)
    delete m_scoreTrees[tree];

  delete m_alignTree;
}
|
||||
|
||||
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
|
||||
{
|
||||
boost::unordered_map<std::string, unsigned>::iterator it
|
||||
= m_sourceSymbolsMap.find(symbol);
|
||||
if(it != m_sourceSymbolsMap.end())
|
||||
return it->second;
|
||||
|
||||
size_t idx = m_sourceSymbols.find(symbol);
|
||||
m_sourceSymbolsMap[symbol] = idx;
|
||||
return idx;
|
||||
}
|
||||
|
||||
inline std::string PhraseDecoder::GetTargetSymbol(unsigned idx) const
|
||||
{
|
||||
if(idx < m_targetSymbols.size())
|
||||
return m_targetSymbols[idx];
|
||||
return std::string("##ERROR##");
|
||||
}
|
||||
|
||||
// REnc symbol type: the value of the top two bits plus one.
inline size_t PhraseDecoder::GetREncType(unsigned encodedSymbol)
{
  const unsigned tag = encodedSymbol >> 30;
  return tag + 1;
}
|
||||
|
||||
// PREnc symbol type: the value of the top bit plus one.
inline size_t PhraseDecoder::GetPREncType(unsigned encodedSymbol)
{
  const unsigned tag = encodedSymbol >> 31;
  return tag + 1;
}
|
||||
|
||||
inline unsigned PhraseDecoder::GetTranslation(unsigned srcIdx, size_t rank)
|
||||
{
|
||||
size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
|
||||
return m_lexicalTable[srcTrgIdx + rank].second;
|
||||
}
|
||||
|
||||
size_t PhraseDecoder::GetMaxSourcePhraseLength()
|
||||
{
|
||||
return m_maxPhraseLength;
|
||||
}
|
||||
|
||||
// Strips the two type bits (bits 30-31) and returns the low 30 bits.
inline unsigned PhraseDecoder::DecodeREncSymbol1(unsigned encodedSymbol)
{
  const unsigned typeBits = 3u << 30;
  return encodedSymbol & ~typeBits;
}
|
||||
|
||||
// Returns the rank stored in the low 24 bits (the top eight bits are
// cleared).
inline unsigned PhraseDecoder::DecodeREncSymbol2Rank(unsigned encodedSymbol)
{
  const unsigned topByte = 255u << 24;
  return encodedSymbol & ~topByte;
}
|
||||
|
||||
// Returns the position stored in bits 24-29: the two type bits are cleared
// and the remainder is shifted down by 24.
inline unsigned PhraseDecoder::DecodeREncSymbol2Position(unsigned encodedSymbol)
{
  const unsigned typeBits = 3u << 30;
  const unsigned withoutType = encodedSymbol & ~typeBits;
  return withoutType >> 24;
}
|
||||
|
||||
// Strips the two type bits (bits 30-31) and returns the low 30 bits.
inline unsigned PhraseDecoder::DecodeREncSymbol3(unsigned encodedSymbol)
{
  const unsigned typeBits = 3u << 30;
  return encodedSymbol & ~typeBits;
}
|
||||
|
||||
// Strips the type bit (bit 31) and returns the low 31 bits.
inline unsigned PhraseDecoder::DecodePREncSymbol1(unsigned encodedSymbol)
{
  const unsigned typeBit = 1u << 31;
  return encodedSymbol & ~typeBit;
}
|
||||
|
||||
// Returns the 6-bit field at bits 25-30 with its bias of 32 removed, i.e. a
// value in [-32, 31].
inline int PhraseDecoder::DecodePREncSymbol2Left(unsigned encodedSymbol)
{
  const int biased = (encodedSymbol >> 25) & 63;
  return biased - 32;
}
|
||||
|
||||
// Returns the 6-bit field at bits 19-24 with its bias of 32 removed, i.e. a
// value in [-32, 31].
inline int PhraseDecoder::DecodePREncSymbol2Right(unsigned encodedSymbol)
{
  const int biased = (encodedSymbol >> 19) & 63;
  return biased - 32;
}
|
||||
|
||||
// Returns the rank stored in the low 19 bits.
inline unsigned PhraseDecoder::DecodePREncSymbol2Rank(unsigned encodedSymbol)
{
  const unsigned rankMask = (1u << 19) - 1;   // == 524287
  return encodedSymbol & rankMask;
}
|
||||
|
||||
size_t PhraseDecoder::Load(std::FILE* in)
|
||||
{
|
||||
size_t start = std::ftell(in);
|
||||
|
||||
std::fread(&m_coding, sizeof(m_coding), 1, in);
|
||||
std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in);
|
||||
std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in);
|
||||
std::fread(&m_maxRank, sizeof(m_maxRank), 1, in);
|
||||
std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in);
|
||||
|
||||
if(m_coding == REnc)
|
||||
{
|
||||
m_sourceSymbols.load(in);
|
||||
|
||||
size_t size;
|
||||
std::fread(&size, sizeof(size_t), 1, in);
|
||||
m_lexicalTableIndex.resize(size);
|
||||
std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in);
|
||||
|
||||
std::fread(&size, sizeof(size_t), 1, in);
|
||||
m_lexicalTable.resize(size);
|
||||
std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in);
|
||||
}
|
||||
|
||||
m_targetSymbols.load(in);
|
||||
|
||||
m_symbolTree = new CanonicalHuffman<unsigned>(in);
|
||||
|
||||
std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in);
|
||||
if(m_multipleScoreTrees)
|
||||
{
|
||||
m_scoreTrees.resize(m_numScoreComponent);
|
||||
for(size_t i = 0; i < m_numScoreComponent; i++)
|
||||
m_scoreTrees[i] = new CanonicalHuffman<float>(in);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_scoreTrees.resize(1);
|
||||
m_scoreTrees[0] = new CanonicalHuffman<float>(in);
|
||||
}
|
||||
|
||||
if(m_containsAlignmentInfo)
|
||||
m_alignTree = new CanonicalHuffman<AlignPoint>(in);
|
||||
|
||||
size_t end = std::ftell(in);
|
||||
return end - start;
|
||||
}
|
||||
|
||||
std::string PhraseDecoder::MakeSourceKey(std::string &source)
|
||||
{
|
||||
return source + m_separator;
|
||||
}
|
||||
|
||||
// Looks up sourcePhrase in the phrase dictionary and returns its decoded
// target phrases, or an empty pointer when the phrase is not in the hash.
//
// With PREnc coding the decoding cache is consulted first. A cached
// collection is returned directly when it is complete (no bits left) or when
// this is not a top-level request; otherwise decoding continues from the
// cached entries using the cached number of remaining bits.
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
{
  // The original author's note here: TargetPhraseCollection is not used, to
  // avoid the "new" operator, which can introduce heavy locking with
  // multiple threads.
  TargetPhraseVectorPtr tpv(new TargetPhraseVector());
  size_t bitsLeft = 0;

  if(m_coding == PREnc)
  {
    std::pair<TargetPhraseVectorPtr, size_t> cached
      = m_decodingCache.Retrieve(sourcePhrase);

    if(cached.first != NULL)
    {
      // Cached and complete, or completion is not required.
      if(!topLevel || cached.second == 0)
        return cached.first;

      // Cached but incomplete: continue from a copy of what is there.
      bitsLeft = cached.second;
      tpv->resize(cached.first->size());
      std::copy(cached.first->begin(),
                cached.first->end(),
                tpv->begin());
    }
  }

  // Retrieve source phrase identifier
  std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input);
  size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)];

  // The hash size is returned for phrases that are not in the hash.
  if(sourcePhraseId == m_phraseDictionary.m_hash.GetSize())
    return TargetPhraseVectorPtr();

  // Retrieve compressed and encoded target phrase collection
  std::string encodedPhraseCollection;
  if(m_phraseDictionary.m_inMemory)
    encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId];
  else
    encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId];

  BitStream<> encodedBitStream(encodedPhraseCollection);
  if(m_coding == PREnc && bitsLeft)
    encodedBitStream.SetLeft(bitsLeft);

  // Decompress and decode target phrase collection
  return DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel);
}
|
||||
|
||||
// Decodes target phrases from encodedBitStream and appends them to tpv.
//
// tpv               vector receiving the decoded phrases; when it is already
//                   non-empty the call is treated as the continuation of an
//                   earlier partial decode and the result is not re-cached
// encodedBitStream  compressed phrase collection, positioned where decoding
//                   should start
// sourcePhrase      source side of every phrase pair in the collection
// topLevel          when false and PREnc coding is used, decoding stops as
//                   soon as m_maxRank phrases are available
//
// Returns tpv, or an empty TargetPhraseVectorPtr when one of the consistency
// checks on the decoded symbols fails.
//
// The decoder is a small state machine that runs per target phrase through
// New -> Symbol (words until the stop symbol) -> Score (m_numScoreComponent
// values) -> Alignment (only if the table stores alignments) -> Add.
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitStream<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  // A non-empty input vector means we are extending a partial collection
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.RemainingBits();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // Source symbol ids are only needed to resolve rank-encoded words
  std::vector<int> sourceWords;
  if(m_coding == REnc)
  {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
    {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.RemainingBits())
  {
    if(state == New)
    {
      // Append a fresh TargetPhrase to the result vector and fill it in place
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(&sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol)
    {
      unsigned symbol = m_symbolTree->NextSymbol(encodedBitStream);

      if(symbol == phraseStopSymbol)
      {
        state = Score;
      }
      else
      {
        if(m_coding == REnc)
        {
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1)
          {
            // plain target word
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          }
          else if(type == 2)
          {
            // translation of an explicitly given source position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(StaticData::Instance().UseAlignmentInfo())
            {
              size_t trgPos = targetPhrase->GetSize();
              // Use the size_t pair type of the set; building an AlignPoint
              // (unsigned char pair) first would truncate the positions
              alignment.insert(AlignPointSizeT(srcPos, trgPos));
            }
          }
          else if(type == 3)
          {
            // translation of the source word at the current target position
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(StaticData::Instance().UseAlignmentInfo())
            {
              size_t trgPos = srcPos;
              alignment.insert(AlignPointSizeT(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        }
        else if(m_coding == PREnc)
        {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1)
          {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else
          {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize)
            {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size())
            {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(StaticData::Instance().UseAlignmentInfo())
              {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); ++it)
                {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            }
            else
              return TargetPhraseVectorPtr();
          }
        }
        else
        {
          // no symbol encoding: the symbol is the target word id
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    }
    else if(state == Score)
    {
      // One Huffman tree per score component, or a single shared tree
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->NextSymbol(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent)
      {
        targetPhrase->SetScore(m_feature, scores, *m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    }
    else if(state == Alignment)
    {
      AlignPoint alignPoint = m_alignTree->NextSymbol(encodedBitStream);
      if(alignPoint == alignStopSymbol)
      {
        state = Add;
      }
      else
      {
        // The point is always consumed from the stream, but only stored
        // when alignment information is requested
        if(StaticData::Instance().UseAlignmentInfo())
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    if(state == Add)
    {
      if(StaticData::Instance().UseAlignmentInfo())
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc)
      {
        // Remember the resume position while still within the rank limit
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.RemainingBits();

        // Sub-phrase lookups never need more than m_maxRank entries
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // NOTE(review): 8 or fewer remaining bits are treated as end of data,
      // presumably byte padding - confirm against the encoder
      if(encodedBitStream.RemainingBits() <= 8)
        break;

      state = New;
    }
  }

  if(m_coding == PREnc && !extending)
  {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
|
||||
|
||||
void PhraseDecoder::PruneCache()
|
||||
{
|
||||
m_decodingCache.Prune();
|
||||
}
|
||||
|
||||
}
|
133
moses/src/CompactPT/PhraseDecoder.h
Normal file
133
moses/src/CompactPT/PhraseDecoder.h
Normal file
@ -0,0 +1,133 @@
|
||||
#ifndef moses_PhraseDecoder_h
|
||||
#define moses_PhraseDecoder_h
|
||||
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/unordered_set.hpp>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "TypeDef.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "Word.h"
|
||||
#include "Util.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "StaticData.h"
|
||||
#include "WordsRange.h"
|
||||
#include "UserMessage.h"
|
||||
|
||||
#include "PhraseDictionaryCompact.h"
|
||||
#include "StringVector.h"
|
||||
#include "CanonicalHuffman.h"
|
||||
#include "ConsistantPhrases.h"
|
||||
#include "TargetPhraseCollectionCache.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class PhraseDictionaryCompact;
|
||||
|
||||
class PhraseDecoder
|
||||
{
|
||||
protected:
|
||||
|
||||
friend class PhraseDictionaryCompact;
|
||||
|
||||
typedef std::pair<unsigned char, unsigned char> AlignPoint;
|
||||
typedef std::pair<unsigned, unsigned> SrcTrg;
|
||||
|
||||
enum Coding { None, REnc, PREnc } m_coding;
|
||||
|
||||
size_t m_numScoreComponent;
|
||||
bool m_containsAlignmentInfo;
|
||||
size_t m_maxRank;
|
||||
size_t m_maxPhraseLength;
|
||||
|
||||
boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
|
||||
StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
|
||||
StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
|
||||
|
||||
std::vector<size_t> m_lexicalTableIndex;
|
||||
std::vector<SrcTrg> m_lexicalTable;
|
||||
|
||||
CanonicalHuffman<unsigned>* m_symbolTree;
|
||||
|
||||
bool m_multipleScoreTrees;
|
||||
std::vector<CanonicalHuffman<float>*> m_scoreTrees;
|
||||
|
||||
CanonicalHuffman<AlignPoint>* m_alignTree;
|
||||
|
||||
TargetPhraseCollectionCache m_decodingCache;
|
||||
|
||||
PhraseDictionaryCompact& m_phraseDictionary;
|
||||
|
||||
// ***********************************************
|
||||
|
||||
const std::vector<FactorType>* m_input;
|
||||
const std::vector<FactorType>* m_output;
|
||||
const PhraseDictionaryFeature* m_feature;
|
||||
const std::vector<float>* m_weight;
|
||||
float m_weightWP;
|
||||
const LMList* m_languageModels;
|
||||
|
||||
std::string m_separator;
|
||||
|
||||
// ***********************************************
|
||||
|
||||
unsigned GetSourceSymbolId(std::string& s);
|
||||
std::string GetTargetSymbol(unsigned id) const;
|
||||
|
||||
size_t GetREncType(unsigned encodedSymbol);
|
||||
size_t GetPREncType(unsigned encodedSymbol);
|
||||
|
||||
unsigned GetTranslation(unsigned srcIdx, size_t rank);
|
||||
|
||||
size_t GetMaxSourcePhraseLength();
|
||||
|
||||
unsigned DecodeREncSymbol1(unsigned encodedSymbol);
|
||||
unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
|
||||
unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
|
||||
unsigned DecodeREncSymbol3(unsigned encodedSymbol);
|
||||
|
||||
unsigned DecodePREncSymbol1(unsigned encodedSymbol);
|
||||
int DecodePREncSymbol2Left(unsigned encodedSymbol);
|
||||
int DecodePREncSymbol2Right(unsigned encodedSymbol);
|
||||
unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
|
||||
|
||||
std::string MakeSourceKey(std::string &);
|
||||
|
||||
public:
|
||||
|
||||
PhraseDecoder(
|
||||
PhraseDictionaryCompact &phraseDictionary,
|
||||
const std::vector<FactorType>* &input,
|
||||
const std::vector<FactorType>* &output,
|
||||
const PhraseDictionaryFeature* feature,
|
||||
size_t numScoreComponent,
|
||||
const std::vector<float>* weight,
|
||||
float weightWP,
|
||||
const LMList* languageModels
|
||||
);
|
||||
|
||||
~PhraseDecoder();
|
||||
|
||||
size_t Load(std::FILE* in);
|
||||
|
||||
TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
|
||||
bool topLevel = false);
|
||||
|
||||
TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
|
||||
BitStream<> &encodedBitStream,
|
||||
const Phrase &sourcePhrase,
|
||||
bool topLevel);
|
||||
|
||||
void PruneCache();
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
188
moses/src/CompactPT/PhraseDictionaryCompact.cpp
Normal file
188
moses/src/CompactPT/PhraseDictionaryCompact.cpp
Normal file
@ -0,0 +1,188 @@
|
||||
// $Id: PhraseDictionaryMemoryHashed.cpp 3908 2011-02-28 11:41:08Z pjwilliams $
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <queue>
|
||||
#include <algorithm>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "PhraseDictionaryCompact.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "Word.h"
|
||||
#include "Util.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "StaticData.h"
|
||||
#include "WordsRange.h"
|
||||
#include "UserMessage.h"
|
||||
#include "ThreadPool.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
bool PhraseDictionaryCompact::Load(const std::vector<FactorType> &input
|
||||
, const std::vector<FactorType> &output
|
||||
, const string &filePath
|
||||
, const vector<float> &weight
|
||||
, size_t tableLimit
|
||||
, const LMList &languageModels
|
||||
, float weightWP)
|
||||
{
|
||||
m_input = &input;
|
||||
m_output = &output;
|
||||
m_weight = &weight;
|
||||
m_tableLimit = tableLimit;
|
||||
m_languageModels = &languageModels;
|
||||
m_weightWP = weightWP;
|
||||
|
||||
std::string fullFilePath = filePath;
|
||||
|
||||
m_phraseDecoder = new PhraseDecoder(*this, m_input, m_output, m_feature,
|
||||
m_numScoreComponent, m_weight, m_weightWP,
|
||||
m_languageModels);
|
||||
|
||||
std::FILE* pFile = std::fopen(fullFilePath.c_str() , "r");
|
||||
|
||||
size_t indexSize;
|
||||
if(m_inMemory)
|
||||
// Load source phrase index into memory
|
||||
indexSize = m_hash.Load(pFile);
|
||||
else
|
||||
// Keep source phrase index on disk
|
||||
indexSize = m_hash.LoadIndex(pFile);
|
||||
|
||||
|
||||
size_t coderSize = m_phraseDecoder->Load(pFile);
|
||||
|
||||
size_t phraseSize;
|
||||
if(m_inMemory)
|
||||
// Load target phrase collections into memory
|
||||
phraseSize = m_targetPhrasesMemory.load(pFile, false);
|
||||
else
|
||||
// Keep target phrase collections on disk
|
||||
phraseSize = m_targetPhrasesMapped.load(pFile, true);
|
||||
|
||||
return indexSize && coderSize && phraseSize;
|
||||
}
|
||||
|
||||
struct CompareTargetPhrase {
|
||||
bool operator() (const TargetPhrase &a, const TargetPhrase &b) {
|
||||
return a.GetFutureScore() > b.GetFutureScore();
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the target phrases for sourcePhrase, limited to the m_tableLimit
// best ones by future score when a limit is set, or NULL if the phrase has
// no translations in the table.
//
// The returned collection is registered in the per-sentence cache and is
// deleted by CleanUp(); callers must not delete it.
const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {

  // There is no such source phrase if source phrase is longer than longest
  // observed source phrase during compilation
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
    return NULL;

  // The sentence cache is keyed by source phrase and CacheForCleanup() does
  // not overwrite existing entries. Building a second collection for a
  // phrase that is already cached would therefore never be registered for
  // deletion and would leak, so reuse the cached collection instead.
  PhraseDictionaryCompact* self = const_cast<PhraseDictionaryCompact*>(this);
  TargetPhraseCollection* cached = self->RetrieveFromCache(sourcePhrase);
  if(cached != NULL)
    return cached;

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decodedPhraseColl
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);

  if(decodedPhraseColl == NULL || decodedPhraseColl->size() == 0)
    return NULL;

  // Work on a copy: the decoded vector may be shared with the decoder's
  // cache and nth_element reorders its input
  TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
  TargetPhraseCollection* phraseColl = new TargetPhraseCollection();

  // Score phrases and if possible apply ttable_limit
  TargetPhraseVector::iterator nth =
    (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
    tpv->end() : tpv->begin() + m_tableLimit;
  std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
  for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; ++it)
    phraseColl->Add(new TargetPhrase(*it));

  // Cache phrase pair for clean-up or retrieval with PREnc
  self->CacheForCleanup(sourcePhrase, phraseColl);

  return phraseColl;
}
|
||||
|
||||
// Releases the phrase decoder created by Load(), if any.
PhraseDictionaryCompact::~PhraseDictionaryCompact() {
  // deleting a null pointer is a no-op, so no explicit check is needed
  delete m_phraseDecoder;
}
|
||||
|
||||
//TO_STRING_BODY(PhraseDictionaryCompact)
|
||||
|
||||
// Returns the collection stored for sourcePhrase in the sentence cache, or
// NULL when there is none. In threaded builds the cache of the calling
// thread is used and the lookup is protected by m_sentenceMutex.
TargetPhraseCollection*
PhraseDictionaryCompact::RetrieveFromCache(const Phrase &sourcePhrase) {
#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_sentenceMutex);
  // operator[] creates an empty cache for a thread seen for the first time
  PhraseCache &cache = m_sentenceCache[pthread_self()];
#else
  PhraseCache &cache = m_sentenceCache;
#endif

  PhraseCache::iterator hit = cache.find(sourcePhrase);
  if(hit == cache.end())
    return NULL;

  return hit->second;
}
|
||||
|
||||
void PhraseDictionaryCompact::CacheForCleanup(const Phrase &sourcePhrase,
|
||||
TargetPhraseCollection* tpc) {
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_sentenceMutex);
|
||||
m_sentenceCache[pthread_self()].insert(std::make_pair(sourcePhrase, tpc));
|
||||
#else
|
||||
m_sentenceCache.insert(std::make_pair(sourcePhrase, tpc));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Intentionally empty: this implementation does nothing per input.
void PhraseDictionaryCompact::InitializeForInput(const Moses::InputType&)
{
}
|
||||
|
||||
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
|
||||
const TargetPhrase &targetPhrase) { }
|
||||
|
||||
void PhraseDictionaryCompact::CleanUp() {
|
||||
if(!m_inMemory)
|
||||
m_hash.KeepNLastRanges(0.01, 0.2);
|
||||
|
||||
m_phraseDecoder->PruneCache();
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_sentenceMutex);
|
||||
PhraseCache &ref = m_sentenceCache[pthread_self()];
|
||||
#else
|
||||
PhraseCache &ref = m_sentenceCache;
|
||||
#endif
|
||||
|
||||
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
|
||||
delete it->second;
|
||||
|
||||
PhraseCache temp;
|
||||
temp.swap(ref);
|
||||
}
|
||||
|
||||
}
|
||||
|
119
moses/src/CompactPT/PhraseDictionaryCompact.h
Normal file
119
moses/src/CompactPT/PhraseDictionaryCompact.h
Normal file
@ -0,0 +1,119 @@
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_PhraseDictionaryCompact_h
|
||||
#define moses_PhraseDictionaryCompact_h
|
||||
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#ifdef BOOST_HAS_PTHREADS
|
||||
#include <boost/thread/mutex.hpp>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "PhraseDictionary.h"
|
||||
#include "ThreadPool.h"
|
||||
|
||||
#include "BlockHashIndex.h"
|
||||
#include "StringVector.h"
|
||||
#include "PhraseDecoder.h"
|
||||
#include "TargetPhraseCollectionCache.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class PhraseDecoder;
|
||||
|
||||
class PhraseDictionaryCompact : public PhraseDictionary
|
||||
{
|
||||
protected:
|
||||
friend class PhraseDecoder;
|
||||
|
||||
PhraseTableImplementation m_implementation;
|
||||
bool m_inMemory;
|
||||
|
||||
typedef std::map<Phrase, TargetPhraseCollection*> PhraseCache;
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex m_sentenceMutex;
|
||||
typedef std::map<size_t, PhraseCache> SentenceCache;
|
||||
#else
|
||||
typedef PhraseCache SentenceCache;
|
||||
#endif
|
||||
SentenceCache m_sentenceCache;
|
||||
|
||||
BlockHashIndex m_hash;
|
||||
PhraseDecoder* m_phraseDecoder;
|
||||
|
||||
StringVector<unsigned char, size_t, MmapAllocator> m_targetPhrasesMapped;
|
||||
StringVector<unsigned char, size_t, std::allocator> m_targetPhrasesMemory;
|
||||
|
||||
const std::vector<FactorType>* m_input;
|
||||
const std::vector<FactorType>* m_output;
|
||||
|
||||
const std::vector<float>* m_weight;
|
||||
const LMList* m_languageModels;
|
||||
float m_weightWP;
|
||||
|
||||
public:
|
||||
PhraseDictionaryCompact(size_t numScoreComponent,
|
||||
PhraseTableImplementation implementation,
|
||||
PhraseDictionaryFeature* feature)
|
||||
: PhraseDictionary(numScoreComponent, feature),
|
||||
m_implementation(implementation),
|
||||
m_inMemory(StaticData::Instance().UseMinphrInMemory()),
|
||||
m_hash(10, 16),
|
||||
m_phraseDecoder(0)
|
||||
{}
|
||||
|
||||
virtual ~PhraseDictionaryCompact();
|
||||
|
||||
bool Load(const std::vector<FactorType> &input
|
||||
, const std::vector<FactorType> &output
|
||||
, const std::string &filePath
|
||||
, const std::vector<float> &weight
|
||||
, size_t tableLimit
|
||||
, const LMList &languageModels
|
||||
, float weightWP);
|
||||
|
||||
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const;
|
||||
|
||||
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);
|
||||
|
||||
void InitializeForInput(const Moses::InputType&);
|
||||
|
||||
TargetPhraseCollection* RetrieveFromCache(const Phrase &sourcePhrase);
|
||||
void CacheForCleanup(const Phrase &source, TargetPhraseCollection* tpc);
|
||||
void CleanUp();
|
||||
|
||||
virtual ChartRuleLookupManager *CreateRuleLookupManager(
|
||||
const InputType &,
|
||||
const ChartCellCollection &)
|
||||
{
|
||||
assert(false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
TO_STRING();
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
1195
moses/src/CompactPT/PhraseTableCreator.cpp
Normal file
1195
moses/src/CompactPT/PhraseTableCreator.cpp
Normal file
File diff suppressed because it is too large
Load Diff
401
moses/src/CompactPT/PhraseTableCreator.h
Normal file
401
moses/src/CompactPT/PhraseTableCreator.h
Normal file
@ -0,0 +1,401 @@
|
||||
#ifndef moses_PhraseTableCreator_h
|
||||
#define moses_PhraseTableCreator_h
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#include "InputFileStream.h"
|
||||
#include "ThreadPool.h"
|
||||
#include "UserMessage.h"
|
||||
#include "Util.h"
|
||||
|
||||
#include "CompactPT/BlockHashIndex.h"
|
||||
#include "CompactPT/ConsistantPhrases.h"
|
||||
#include "CompactPT/StringVector.h"
|
||||
#include "CompactPT/CanonicalHuffman.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
typedef std::pair<unsigned char, unsigned char> AlignPoint;
|
||||
|
||||
template <typename DataType>
|
||||
class Counter
|
||||
{
|
||||
public:
|
||||
typedef boost::unordered_map<DataType, size_t> FreqMap;
|
||||
typedef typename FreqMap::iterator iterator;
|
||||
typedef typename FreqMap::mapped_type mapped_type;
|
||||
typedef typename FreqMap::value_type value_type;
|
||||
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex m_mutex;
|
||||
#endif
|
||||
FreqMap m_freqMap;
|
||||
size_t m_maxSize;
|
||||
std::vector<DataType> m_bestVec;
|
||||
|
||||
struct FreqSorter
|
||||
{
|
||||
bool operator()(const value_type& a, const value_type& b) const
|
||||
{
|
||||
if(a.second > b.second)
|
||||
return true;
|
||||
// Check impact on translation quality!
|
||||
if(a.second == b.second && a.first > b.first)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
Counter() : m_maxSize(0) {}
|
||||
|
||||
iterator Begin()
|
||||
{
|
||||
return m_freqMap.begin();
|
||||
}
|
||||
|
||||
iterator End()
|
||||
{
|
||||
return m_freqMap.end();
|
||||
}
|
||||
|
||||
void Increase(DataType data)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_freqMap[data]++;
|
||||
}
|
||||
|
||||
void IncreaseBy(DataType data, size_t num)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_freqMap[data] += num;
|
||||
}
|
||||
|
||||
mapped_type& operator[](DataType data)
|
||||
{
|
||||
return m_freqMap[data];
|
||||
}
|
||||
|
||||
size_t Size()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
return m_freqMap.size();
|
||||
}
|
||||
|
||||
void Quantize(size_t maxSize)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_maxSize = maxSize;
|
||||
std::vector<std::pair<DataType, mapped_type> > freqVec;
|
||||
freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
|
||||
std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
|
||||
|
||||
for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
|
||||
m_bestVec.push_back(freqVec[i].first);
|
||||
|
||||
std::sort(m_bestVec.begin(), m_bestVec.end());
|
||||
|
||||
FreqMap t_freqMap;
|
||||
for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
|
||||
= freqVec.begin(); it != freqVec.end(); it++)
|
||||
{
|
||||
DataType closest = LowerBound(it->first);
|
||||
t_freqMap[closest] += it->second;
|
||||
}
|
||||
|
||||
m_freqMap.swap(t_freqMap);
|
||||
}
|
||||
|
||||
void Clear()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_freqMap.clear();
|
||||
}
|
||||
|
||||
DataType LowerBound(DataType data)
|
||||
{
|
||||
if(m_maxSize == 0 || m_bestVec.size() == 0)
|
||||
return data;
|
||||
else
|
||||
{
|
||||
typename std::vector<DataType>::iterator it
|
||||
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
|
||||
if(it != m_bestVec.end())
|
||||
return *it;
|
||||
else
|
||||
return m_bestVec.back();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Work item passed through the priority queues of PhraseTableCreator: a
// line number together with a source phrase, a packed target phrase, a
// rank and a score.
class PackedItem
{
  private:
    long m_line;
    std::string m_sourcePhrase;
    std::string m_packedTargetPhrase;
    size_t m_rank;
    float m_score;

  public:
    PackedItem(long line, std::string sourcePhrase,
               std::string packedTargetPhrase, size_t rank,
               float m_score = 0);

    // Read-only accessors
    long GetLine() const;
    const std::string& GetSrc() const;
    const std::string& GetTrg() const;
    size_t GetRank() const;
    float GetScore() const;
};
|
||||
|
||||
// Reversed ordering by line number: an item with a larger line number
// compares as "less", so std::priority_queue<PackedItem> yields the item
// with the smallest line number first.
//
// Items with equal line numbers compare as equivalent (neither is less than
// the other), as required of a strict weak ordering. Declared inline rather
// than static because it lives in a header.
inline bool operator<(const PackedItem &pi1, const PackedItem &pi2) {
  return pi1.GetLine() > pi2.GetLine();
}
|
||||
|
||||
class PhraseTableCreator
|
||||
{
|
||||
public:
|
||||
enum Coding { None, REnc, PREnc };
|
||||
|
||||
private:
|
||||
std::string m_inPath;
|
||||
std::string m_outPath;
|
||||
|
||||
std::FILE* m_outFile;
|
||||
|
||||
size_t m_numScoreComponent;
|
||||
Coding m_coding;
|
||||
size_t m_orderBits;
|
||||
size_t m_fingerPrintBits;
|
||||
bool m_useAlignmentInfo;
|
||||
bool m_multipleScoreTrees;
|
||||
size_t m_quantize;
|
||||
size_t m_maxRank;
|
||||
|
||||
static std::string m_phraseStopSymbol;
|
||||
static std::string m_separator;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
size_t m_threads;
|
||||
boost::mutex m_mutex;
|
||||
#endif
|
||||
|
||||
BlockHashIndex m_srcHash;
|
||||
BlockHashIndex m_rnkHash;
|
||||
|
||||
size_t m_maxPhraseLength;
|
||||
|
||||
std::vector<unsigned> m_ranks;
|
||||
|
||||
typedef std::pair<unsigned, unsigned> SrcTrg;
|
||||
typedef std::pair<std::string, std::string> SrcTrgString;
|
||||
typedef std::pair<SrcTrgString, float> SrcTrgProb;
|
||||
|
||||
struct SrcTrgProbSorter
|
||||
{
|
||||
bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const
|
||||
{
|
||||
if(a.first.first < b.first.first)
|
||||
return true;
|
||||
|
||||
if(a.first.first == b.first.first && a.second > b.second)
|
||||
return true;
|
||||
|
||||
if(a.first.first == b.first.first
|
||||
&& a.second == b.second
|
||||
&& a.first.second < b.first.second)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<size_t> m_lexicalTableIndex;
|
||||
std::vector<SrcTrg> m_lexicalTable;
|
||||
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator>
|
||||
m_encodedTargetPhrases;
|
||||
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator>
|
||||
m_compressedTargetPhrases;
|
||||
|
||||
boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
|
||||
boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
|
||||
|
||||
typedef Counter<unsigned> SymbolCounter;
|
||||
typedef Counter<float> ScoreCounter;
|
||||
typedef Counter<AlignPoint> AlignCounter;
|
||||
|
||||
typedef CanonicalHuffman<unsigned> SymbolTree;
|
||||
typedef CanonicalHuffman<float> ScoreTree;
|
||||
typedef CanonicalHuffman<AlignPoint> AlignTree;
|
||||
|
||||
SymbolCounter m_symbolCounter;
|
||||
SymbolTree* m_symbolTree;
|
||||
|
||||
AlignCounter m_alignCounter;
|
||||
AlignTree* m_alignTree;
|
||||
|
||||
std::vector<ScoreCounter*> m_scoreCounters;
|
||||
std::vector<ScoreTree*> m_scoreTrees;
|
||||
|
||||
std::priority_queue<PackedItem> m_queue;
|
||||
long m_lastFlushedLine;
|
||||
long m_lastFlushedSourceNum;
|
||||
std::string m_lastFlushedSourcePhrase;
|
||||
std::vector<std::string> m_lastSourceRange;
|
||||
std::priority_queue<std::pair<float, size_t> > m_rankQueue;
|
||||
std::vector<std::string> m_lastCollection;
|
||||
|
||||
void Save();
|
||||
void PrintInfo();
|
||||
|
||||
void AddSourceSymbolId(std::string& symbol);
|
||||
unsigned GetSourceSymbolId(std::string& symbol);
|
||||
|
||||
void AddTargetSymbolId(std::string& symbol);
|
||||
unsigned GetTargetSymbolId(std::string& symbol);
|
||||
unsigned GetOrAddTargetSymbolId(std::string& symbol);
|
||||
|
||||
unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
|
||||
|
||||
unsigned EncodeREncSymbol1(unsigned symbol);
|
||||
unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
|
||||
unsigned EncodeREncSymbol3(unsigned rank);
|
||||
|
||||
unsigned EncodePREncSymbol1(unsigned symbol);
|
||||
unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
|
||||
|
||||
void EncodeTargetPhraseNone(std::vector<std::string>& t,
|
||||
std::ostream& os);
|
||||
|
||||
void EncodeTargetPhraseREnc(std::vector<std::string>& s,
|
||||
std::vector<std::string>& t,
|
||||
std::set<AlignPoint>& a,
|
||||
std::ostream& os);
|
||||
|
||||
void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
|
||||
std::vector<std::string>& t,
|
||||
std::set<AlignPoint>& a, size_t ownRank,
|
||||
std::ostream& os);
|
||||
|
||||
void EncodeScores(std::vector<float>& scores, std::ostream& os);
|
||||
void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
|
||||
|
||||
std::string MakeSourceKey(std::string&);
|
||||
std::string MakeSourceTargetKey(std::string&, std::string&);
|
||||
|
||||
void LoadLexicalTable(std::string filePath);
|
||||
|
||||
void CreateRankHash();
|
||||
void EncodeTargetPhrases();
|
||||
void CalcHuffmanCodes();
|
||||
void CompressTargetPhrases();
|
||||
|
||||
void AddRankedLine(PackedItem& pi);
|
||||
void FlushRankedQueue(bool force = false);
|
||||
|
||||
std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
|
||||
void AddEncodedLine(PackedItem& pi);
|
||||
void FlushEncodedQueue(bool force = false);
|
||||
|
||||
std::string CompressEncodedCollection(std::string encodedCollection);
|
||||
void AddCompressedCollection(PackedItem& pi);
|
||||
void FlushCompressedQueue(bool force = false);
|
||||
|
||||
public:
|
||||
|
||||
PhraseTableCreator(std::string inPath,
|
||||
std::string outPath,
|
||||
size_t numScoreComponent = 5,
|
||||
Coding coding = PREnc,
|
||||
size_t orderBits = 10,
|
||||
size_t fingerPrintBits = 16,
|
||||
bool useAlignmentInfo = false,
|
||||
bool multipleScoreTrees = true,
|
||||
size_t quantize = 0,
|
||||
size_t maxRank = 100
|
||||
#ifdef WITH_THREADS
|
||||
, size_t threads = 2
|
||||
#endif
|
||||
);
|
||||
|
||||
friend class RankingTask;
|
||||
friend class EncodingTask;
|
||||
friend class CompressionTask;
|
||||
};
|
||||
|
||||
class RankingTask
|
||||
{
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
static boost::mutex m_mutex;
|
||||
static boost::mutex m_fileMutex;
|
||||
#endif
|
||||
static size_t m_lineNum;
|
||||
InputFileStream& m_inFile;
|
||||
PhraseTableCreator& m_creator;
|
||||
|
||||
public:
|
||||
RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
|
||||
void operator()();
|
||||
};
|
||||
|
||||
class EncodingTask
|
||||
{
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
static boost::mutex m_mutex;
|
||||
static boost::mutex m_fileMutex;
|
||||
#endif
|
||||
static size_t m_lineNum;
|
||||
static size_t m_sourcePhraseNum;
|
||||
static std::string m_lastSourcePhrase;
|
||||
|
||||
InputFileStream& m_inFile;
|
||||
PhraseTableCreator& m_creator;
|
||||
|
||||
public:
|
||||
EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
|
||||
void operator()();
|
||||
};
|
||||
|
||||
class CompressionTask
|
||||
{
|
||||
private:
|
||||
#ifdef WITH_THREADS
|
||||
static boost::mutex m_mutex;
|
||||
#endif
|
||||
static size_t m_collectionNum;
|
||||
StringVector<unsigned char, unsigned long, MmapAllocator>&
|
||||
m_encodedCollections;
|
||||
PhraseTableCreator& m_creator;
|
||||
|
||||
public:
|
||||
CompressionTask(StringVector<unsigned char, unsigned long, MmapAllocator>&
|
||||
encodedCollections, PhraseTableCreator& creator);
|
||||
void operator()();
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
600
moses/src/CompactPT/StringVector.h
Normal file
600
moses/src/CompactPT/StringVector.h
Normal file
@ -0,0 +1,600 @@
|
||||
#ifndef moses_StringVector_h
|
||||
#define moses_StringVector_h
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
|
||||
#include <boost/iterator/iterator_facade.hpp>
|
||||
|
||||
#include "MonotonicVector.h"
|
||||
#include "MmapAllocator.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// ********** ValueIteratorRange **********
|
||||
|
||||
// View of the half-open iterator range [begin, end). The object stores only
// the two iterators; it does not own or copy the elements they refer to.
//
// Ranges can be compared (== and <) against any string-like object exposing
// begin()/end(), and against C strings.
template <typename ValueIteratorT>
class ValueIteratorRange
{
private:
  ValueIteratorT m_begin;
  ValueIteratorT m_end;

public:
  ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);

  const ValueIteratorT& begin() const;
  const ValueIteratorT& end() const;

  // Copies the referenced elements into a std::string.
  const std::string str() const;

  // Implicit conversion to std::string; same result as str().
  // Declared const so that it is also usable on const ranges.
  operator const std::string() const {
    return str();
  }

  // Number of elements between begin() and end().
  // Declared const: it only reads the two stored iterators.
  size_t size() const {
    return std::distance(m_begin, m_end);
  }

  template <typename StringT>
  bool operator==(const StringT& o) const;
  bool operator==(const char* c) const;

  template <typename StringT>
  bool operator<(const StringT& o) const;
  bool operator<(const char* c) const;
};
|
||||
|
||||
// ********** StringVector **********
|
||||
|
||||
template <typename ValueT = unsigned char, typename PosT = unsigned int,
|
||||
template <typename> class Allocator = std::allocator>
|
||||
class StringVector
|
||||
{
|
||||
protected:
|
||||
std::vector<ValueT, Allocator<ValueT> > m_charArray;
|
||||
MonotonicVector<PosT, unsigned int, 32, Allocator> m_positions;
|
||||
bool m_sorted;
|
||||
bool m_memoryMapped;
|
||||
|
||||
virtual const ValueT* value_ptr(PosT i) const;
|
||||
|
||||
public:
|
||||
typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
|
||||
|
||||
// ********** RangeIterator **********
|
||||
|
||||
class RangeIterator : public boost::iterator_facade<RangeIterator,
|
||||
range, std::random_access_iterator_tag, range, PosT>
|
||||
{
|
||||
|
||||
private:
|
||||
PosT m_index;
|
||||
StringVector<ValueT, PosT, Allocator>* m_container;
|
||||
|
||||
public:
|
||||
RangeIterator();
|
||||
RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
|
||||
|
||||
PosT get_index();
|
||||
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
|
||||
range dereference() const;
|
||||
bool equal(RangeIterator const& other) const;
|
||||
void increment();
|
||||
void decrement();
|
||||
void advance(PosT n);
|
||||
|
||||
PosT distance_to(RangeIterator const& other) const;
|
||||
};
|
||||
|
||||
// ********** StringIterator **********
|
||||
|
||||
class StringIterator : public boost::iterator_facade<StringIterator,
|
||||
std::string, std::random_access_iterator_tag, const std::string, PosT>
|
||||
{
|
||||
|
||||
private:
|
||||
PosT m_index;
|
||||
StringVector<ValueT, PosT, Allocator>* m_container;
|
||||
|
||||
public:
|
||||
StringIterator();
|
||||
StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
|
||||
|
||||
PosT get_index();
|
||||
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
|
||||
const std::string dereference() const;
|
||||
bool equal(StringIterator const& other) const;
|
||||
void increment();
|
||||
void decrement();
|
||||
void advance(PosT n);
|
||||
PosT distance_to(StringIterator const& other) const;
|
||||
};
|
||||
|
||||
typedef RangeIterator iterator;
|
||||
typedef StringIterator string_iterator;
|
||||
|
||||
StringVector();
|
||||
|
||||
void swap(StringVector<ValueT, PosT, Allocator> &c)
|
||||
{
|
||||
m_positions.commit();
|
||||
m_positions.swap(c.m_positions);
|
||||
m_charArray.swap(c.m_charArray);
|
||||
|
||||
bool temp = m_sorted;
|
||||
m_sorted = c.m_sorted;
|
||||
c.m_sorted = temp;
|
||||
}
|
||||
|
||||
bool is_sorted() const;
|
||||
PosT size() const;
|
||||
virtual PosT size2() const;
|
||||
|
||||
template<class Iterator> Iterator begin() const;
|
||||
template<class Iterator> Iterator end() const;
|
||||
|
||||
iterator begin() const;
|
||||
iterator end() const;
|
||||
|
||||
PosT length(PosT i) const;
|
||||
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
|
||||
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
|
||||
|
||||
void clear()
|
||||
{
|
||||
m_charArray.clear();
|
||||
m_sorted = true;
|
||||
m_positions = MonotonicVector<PosT, unsigned int, 32>();
|
||||
}
|
||||
|
||||
range at(PosT i) const;
|
||||
range operator[](PosT i) const;
|
||||
range back() const;
|
||||
|
||||
template <typename StringT>
|
||||
void push_back(StringT s);
|
||||
void push_back(const char* c);
|
||||
|
||||
template <typename StringT>
|
||||
PosT find(StringT &s) const;
|
||||
PosT find(const char* c) const;
|
||||
|
||||
virtual size_t load(std::FILE* in, bool memoryMapped = false)
|
||||
{
|
||||
size_t size = 0;
|
||||
m_memoryMapped = memoryMapped;
|
||||
|
||||
size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
|
||||
size += m_positions.load(in, m_memoryMapped);
|
||||
|
||||
size += loadCharArray(m_charArray, in, m_memoryMapped);
|
||||
return size;
|
||||
}
|
||||
|
||||
size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >& c,
|
||||
std::FILE* in, bool map = false)
|
||||
{
|
||||
// Can only be read into memory. Mapping not possible with std:allocator.
|
||||
assert(map == false);
|
||||
|
||||
size_t byteSize = 0;
|
||||
|
||||
size_t valSize;
|
||||
byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
|
||||
|
||||
c.resize(valSize, 0);
|
||||
byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >& c,
|
||||
std::FILE* in, bool map = false)
|
||||
{
|
||||
size_t byteSize = 0;
|
||||
|
||||
size_t valSize;
|
||||
byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
|
||||
|
||||
if(map == false)
|
||||
{
|
||||
// Read data into temporary file (default constructor of MmapAllocator)
|
||||
// and map memory onto temporary file. Can be resized.
|
||||
|
||||
c.resize(valSize, 0);
|
||||
byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Map it directly on specified region of file "in" starting at valPos
|
||||
// with length valSize * sizeof(ValueT). Mapped region cannot be resized.
|
||||
|
||||
size_t valPos = std::ftell(in);
|
||||
Allocator<ValueT> alloc(in, valPos);
|
||||
std::vector<ValueT, Allocator<ValueT> > charArrayTemp(alloc);
|
||||
charArrayTemp.resize(valSize);
|
||||
c.swap(charArrayTemp);
|
||||
|
||||
byteSize += valSize * sizeof(ValueT);
|
||||
}
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t load(std::string filename, bool memoryMapped = false)
|
||||
{
|
||||
std::FILE* pFile = fopen(filename.c_str(), "r");
|
||||
size_t byteSize = load(pFile, memoryMapped);
|
||||
fclose(pFile);
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t save(std::FILE* out)
|
||||
{
|
||||
size_t byteSize = 0;
|
||||
byteSize += std::fwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
|
||||
|
||||
byteSize += m_positions.save(out);
|
||||
|
||||
size_t valSize = size2();
|
||||
byteSize += std::fwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
|
||||
byteSize += std::fwrite(&m_charArray[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
|
||||
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
size_t save(std::string filename)
|
||||
{
|
||||
std::FILE* pFile = fopen(filename.c_str(), "w");
|
||||
size_t byteSize = save(pFile);
|
||||
fclose(pFile);
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// ********** Implementation **********
|
||||
|
||||
// ValueIteratorRange
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
ValueIteratorRange<ValueIteratorT>::ValueIteratorRange(ValueIteratorT begin,
|
||||
ValueIteratorT end) : m_begin(begin), m_end(end) { }
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::begin() const
|
||||
{
|
||||
return m_begin;
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::end() const
|
||||
{
|
||||
return m_end;
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
const std::string ValueIteratorRange<ValueIteratorT>::str() const
|
||||
{
|
||||
std::string dummy;
|
||||
for(ValueIteratorT it = m_begin; it != m_end; it++)
|
||||
dummy.push_back(*it);
|
||||
return dummy;
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
template <typename StringT>
|
||||
bool ValueIteratorRange<ValueIteratorT>::operator==(const StringT& o) const
|
||||
{
|
||||
if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
|
||||
return std::equal(m_begin, m_end, o.begin());
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
bool ValueIteratorRange<ValueIteratorT>::operator==(const char* c) const
|
||||
{
|
||||
return *this == std::string(c);
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
template <typename StringT>
|
||||
bool ValueIteratorRange<ValueIteratorT>::operator<(const StringT &s2) const
|
||||
{
|
||||
return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
|
||||
std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
bool ValueIteratorRange<ValueIteratorT>::operator<(const char* c) const
|
||||
{
|
||||
return *this < std::string(c);
|
||||
}
|
||||
|
||||
template <typename StringT, typename ValueIteratorT>
|
||||
bool operator<(const StringT &s1, const ValueIteratorRange<ValueIteratorT> &s2)
|
||||
{
|
||||
return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
|
||||
std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
|
||||
}
|
||||
|
||||
template <typename ValueIteratorT>
|
||||
bool operator<(const char* c, const ValueIteratorRange<ValueIteratorT> &s2)
|
||||
{
|
||||
size_t len = std::char_traits<char>::length(c);
|
||||
return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
|
||||
std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
|
||||
}
|
||||
|
||||
template <typename OStream, typename ValueIteratorT>
|
||||
OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
|
||||
{
|
||||
ValueIteratorT it = cr.begin();
|
||||
while(it != cr.end())
|
||||
os << *(it++);
|
||||
return os;
|
||||
}
|
||||
|
||||
// StringVector
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
StringVector<ValueT, PosT, Allocator>::StringVector()
|
||||
: m_sorted(true), m_memoryMapped(false) { }
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
template <typename StringT>
|
||||
void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
|
||||
{
|
||||
if(is_sorted() && size() && !(back() < s))
|
||||
m_sorted = false;
|
||||
|
||||
m_positions.push_back(size2());
|
||||
std::copy(s.begin(), s.end(), std::back_inserter(m_charArray));
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::push_back(const char* c)
|
||||
{
|
||||
std::string dummy(c);
|
||||
push_back(dummy);
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
template <typename Iterator>
|
||||
Iterator StringVector<ValueT, PosT, Allocator>::begin() const
|
||||
{
|
||||
return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
template <typename Iterator>
|
||||
Iterator StringVector<ValueT, PosT, Allocator>::end() const
|
||||
{
|
||||
return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::begin() const
|
||||
{
|
||||
return begin<iterator>();
|
||||
};
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::end() const
|
||||
{
|
||||
return end<iterator>();
|
||||
};
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
bool StringVector<ValueT, PosT, Allocator>::is_sorted() const
|
||||
{
|
||||
return m_sorted;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::size() const
|
||||
{
|
||||
return m_positions.size();
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::size2() const
|
||||
{
|
||||
return m_charArray.size();
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::at(PosT i) const
|
||||
{
|
||||
return range(begin(i), end(i));
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::operator[](PosT i) const
|
||||
{
|
||||
return at(i);
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::back() const
|
||||
{
|
||||
return at(size()-1);
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
|
||||
{
|
||||
if(i+1 < size())
|
||||
return m_positions[i+1] - m_positions[i];
|
||||
else
|
||||
return size2() - m_positions[i];
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
|
||||
{
|
||||
return &m_charArray[m_positions[i]];
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
|
||||
{
|
||||
return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::end(PosT i) const
|
||||
{
|
||||
return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
template <typename StringT>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::find(StringT &s) const
|
||||
{
|
||||
if(m_sorted)
|
||||
return std::distance(begin(), std::lower_bound(begin(), end(), s));
|
||||
return std::distance(begin(), std::find(begin(), end(), s));
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::find(const char* c) const
|
||||
{
|
||||
std::string s(c);
|
||||
return find(s);
|
||||
}
|
||||
|
||||
// RangeIterator
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(0), m_container(0) { }
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index)
|
||||
: m_index(index), m_container(&sv) { }
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::get_index()
|
||||
{
|
||||
return m_index;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
typename StringVector<ValueT, PosT, Allocator>::range
|
||||
StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
|
||||
{
|
||||
return typename StringVector<ValueT, PosT, Allocator>::range(
|
||||
m_container->begin(m_index),
|
||||
m_container->end(m_index)
|
||||
);
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
bool StringVector<ValueT, PosT, Allocator>::RangeIterator::equal(
|
||||
StringVector<ValueT, PosT, Allocator>::RangeIterator const& other) const
|
||||
{
|
||||
return m_index == other.m_index && m_container == other.m_container;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::RangeIterator::increment()
|
||||
{
|
||||
m_index++;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::RangeIterator::decrement()
|
||||
{
|
||||
m_index--;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
|
||||
{
|
||||
m_index += n;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::distance_to(
|
||||
StringVector<ValueT, PosT, Allocator>::RangeIterator const& other) const
|
||||
{
|
||||
return other.m_index - m_index;
|
||||
}
|
||||
|
||||
// StringIterator
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator()
|
||||
: m_index(0), m_container(0) { }
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator(
|
||||
StringVector<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
|
||||
m_container(&sv) { }
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::get_index()
|
||||
{
|
||||
return m_index;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
const std::string StringVector<ValueT, PosT, Allocator>::StringIterator::dereference() const
|
||||
{
|
||||
return StringVector<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
|
||||
m_container->end(m_index)).str();
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
bool StringVector<ValueT, PosT, Allocator>::StringIterator::equal(
|
||||
StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
|
||||
{
|
||||
return m_index == other.m_index && m_container == other.m_container;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::StringIterator::increment()
|
||||
{
|
||||
m_index++;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::StringIterator::decrement()
|
||||
{
|
||||
m_index--;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
void StringVector<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
|
||||
{
|
||||
m_index += n;
|
||||
}
|
||||
|
||||
template<typename ValueT, typename PosT, template <typename> class Allocator>
|
||||
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::distance_to(
|
||||
StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
|
||||
{
|
||||
return other.m_index - m_index;
|
||||
}
|
||||
|
||||
// ********** Some typedefs **********
|
||||
|
||||
// StringVector of unsigned char whose position/index type (PosT) is unsigned int.
typedef StringVector<unsigned char, unsigned int> MediumStringVector;
|
||||
// StringVector of unsigned char whose position/index type (PosT) is unsigned long.
typedef StringVector<unsigned char, unsigned long> LongStringVector;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
161
moses/src/CompactPT/TargetPhraseCollectionCache.h
Normal file
161
moses/src/CompactPT/TargetPhraseCollectionCache.h
Normal file
@ -0,0 +1,161 @@
|
||||
#ifndef moses_TargetPhraseCollectionCache_h
|
||||
#define moses_TargetPhraseCollectionCache_h
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#ifdef BOOST_HAS_PTHREADS
|
||||
#include <boost/thread/mutex.hpp>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "Phrase.h"
|
||||
#include "TargetPhraseCollection.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Avoid using new due to locking
|
||||
// TargetPhrase objects are held by value in a std::vector.
typedef std::vector<TargetPhrase> TargetPhraseVector;
|
||||
// Shared-ownership handle (boost::shared_ptr) to a TargetPhraseVector.
typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
|
||||
|
||||
class TargetPhraseCollectionCache
|
||||
{
|
||||
private:
|
||||
size_t m_max;
|
||||
float m_tolerance;
|
||||
|
||||
struct LastUsed {
|
||||
clock_t m_clock;
|
||||
TargetPhraseVectorPtr m_tpv;
|
||||
size_t m_bitsLeft;
|
||||
|
||||
LastUsed() : m_clock(0), m_bitsLeft(0) {}
|
||||
|
||||
LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
|
||||
: m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
|
||||
};
|
||||
|
||||
typedef std::map<Phrase, LastUsed> CacheMap;
|
||||
|
||||
CacheMap m_phraseCache;
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex m_mutex;
|
||||
#endif
|
||||
|
||||
public:
|
||||
|
||||
typedef CacheMap::iterator iterator;
|
||||
typedef CacheMap::const_iterator const_iterator;
|
||||
|
||||
TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
|
||||
: m_max(max), m_tolerance(tolerance)
|
||||
{}
|
||||
|
||||
iterator Begin()
|
||||
{
|
||||
return m_phraseCache.begin();
|
||||
}
|
||||
|
||||
const_iterator Begin() const
|
||||
{
|
||||
return m_phraseCache.begin();
|
||||
}
|
||||
|
||||
iterator End()
|
||||
{
|
||||
return m_phraseCache.end();
|
||||
}
|
||||
|
||||
const_iterator End() const
|
||||
{
|
||||
return m_phraseCache.end();
|
||||
}
|
||||
|
||||
void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
|
||||
size_t bitsLeft = 0, size_t maxRank = 0)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
iterator it = m_phraseCache.find(sourcePhrase);
|
||||
if(it != m_phraseCache.end())
|
||||
it->second.m_clock = clock();
|
||||
else
|
||||
{
|
||||
if(maxRank && tpv->size() > maxRank)
|
||||
{
|
||||
TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
|
||||
tpv_temp->resize(maxRank);
|
||||
std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
|
||||
m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
|
||||
}
|
||||
else
|
||||
m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
iterator it = m_phraseCache.find(sourcePhrase);
|
||||
if(it != m_phraseCache.end())
|
||||
{
|
||||
LastUsed &lu = it->second;
|
||||
lu.m_clock = clock();
|
||||
return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
|
||||
}
|
||||
else
|
||||
return std::make_pair(TargetPhraseVectorPtr(), 0);
|
||||
}
|
||||
|
||||
void Prune()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
|
||||
if(m_phraseCache.size() > m_max * (1 + m_tolerance))
|
||||
{
|
||||
typedef std::set<std::pair<clock_t, Phrase> > Cands;
|
||||
Cands cands;
|
||||
for(CacheMap::iterator it = m_phraseCache.begin();
|
||||
it != m_phraseCache.end(); it++)
|
||||
{
|
||||
LastUsed &lu = it->second;
|
||||
cands.insert(std::make_pair(lu.m_clock, it->first));
|
||||
}
|
||||
|
||||
for(Cands::iterator it = cands.begin(); it != cands.end(); it++)
|
||||
{
|
||||
const Phrase& p = it->second;
|
||||
m_phraseCache.erase(p);
|
||||
|
||||
if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CleanUp()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::mutex::scoped_lock lock(m_mutex);
|
||||
#endif
|
||||
m_phraseCache.clear();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user