2013-05-29 21:16:15 +04:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
2012-08-03 18:38:45 +04:00
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
#ifndef moses_BlockHashIndex_h
|
|
|
|
#define moses_BlockHashIndex_h
|
|
|
|
|
|
|
|
#include <iostream>
|
2014-01-13 22:32:22 +04:00
|
|
|
#include <sstream>
|
2012-08-02 20:32:55 +04:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2013-05-29 21:16:15 +04:00
|
|
|
#include <queue>
|
2012-08-02 20:32:55 +04:00
|
|
|
#include <cstring>
|
|
|
|
#include <cstdio>
|
|
|
|
|
|
|
|
#include "MurmurHash3.h"
|
|
|
|
#include "StringVector.h"
|
|
|
|
#include "PackedArray.h"
|
2014-01-13 18:37:05 +04:00
|
|
|
#include "util/exception.hh"
|
2012-08-02 20:32:55 +04:00
|
|
|
|
|
|
|
#ifdef WITH_THREADS
|
2012-11-12 23:56:18 +04:00
|
|
|
#include "moses/ThreadPool.h"
|
2013-12-19 00:15:39 +04:00
|
|
|
#else
|
2015-03-28 16:09:03 +03:00
|
|
|
#include <ctime>
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
|
|
|
|
2015-03-21 19:12:52 +03:00
|
|
|
#include <boost/shared_ptr.hpp>
|
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
|
|
|
class BlockHashIndex
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
private:
|
|
|
|
std::priority_queue<int> m_queue;
|
|
|
|
|
|
|
|
size_t m_orderBits;
|
|
|
|
size_t m_fingerPrintBits;
|
|
|
|
|
|
|
|
std::FILE* m_fileHandle;
|
|
|
|
size_t m_fileHandleStart;
|
|
|
|
|
|
|
|
StringVector<unsigned char, unsigned long> m_landmarks;
|
|
|
|
|
|
|
|
std::vector<void*> m_hashes;
|
|
|
|
std::vector<clock_t> m_clocks;
|
|
|
|
std::vector<PairedPackedArray<>*> m_arrays;
|
|
|
|
|
|
|
|
std::vector<size_t> m_seekIndex;
|
|
|
|
|
|
|
|
size_t m_size;
|
|
|
|
int m_lastSaved;
|
|
|
|
int m_lastDropped;
|
|
|
|
size_t m_numLoadedRanges;
|
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
#ifdef WITH_THREADS
|
2013-05-29 21:16:15 +04:00
|
|
|
ThreadPool m_threadPool;
|
|
|
|
boost::mutex m_mutex;
|
|
|
|
|
|
|
|
template <typename Keys>
|
|
|
|
class HashTask : public Task
|
|
|
|
{
|
2012-08-02 20:32:55 +04:00
|
|
|
public:
|
2013-05-29 21:16:15 +04:00
|
|
|
HashTask(int id, BlockHashIndex& hash, Keys& keys)
|
|
|
|
: m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
|
|
|
|
|
|
|
|
virtual void Run() {
|
|
|
|
m_hash.CalcHash(m_id, *m_keys);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual ~HashTask() {
|
|
|
|
delete m_keys;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
int m_id;
|
|
|
|
BlockHashIndex& m_hash;
|
|
|
|
Keys* m_keys;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
|
|
|
size_t GetFprint(const char* key) const;
|
|
|
|
size_t GetHash(size_t i, const char* key);
|
|
|
|
|
|
|
|
public:
|
2012-08-02 20:32:55 +04:00
|
|
|
#ifdef WITH_THREADS
|
2013-05-29 21:16:15 +04:00
|
|
|
BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
|
|
|
|
size_t threadsNum = 2);
|
2012-08-02 20:32:55 +04:00
|
|
|
#else
|
2013-05-29 21:16:15 +04:00
|
|
|
BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
~BlockHashIndex();
|
|
|
|
|
|
|
|
size_t GetHash(const char* key);
|
|
|
|
size_t GetHash(std::string key);
|
|
|
|
|
|
|
|
size_t operator[](std::string key);
|
|
|
|
size_t operator[](char* key);
|
|
|
|
|
|
|
|
void BeginSave(std::FILE* mphf);
|
|
|
|
void SaveRange(size_t i);
|
|
|
|
void SaveLastRange();
|
|
|
|
size_t FinalizeSave();
|
2012-08-02 20:32:55 +04:00
|
|
|
|
|
|
|
#ifdef WITH_THREADS
|
2013-05-29 21:16:15 +04:00
|
|
|
void WaitAll();
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
|
|
void DropRange(size_t i);
|
|
|
|
void DropLastRange();
|
|
|
|
|
|
|
|
size_t LoadIndex(std::FILE* mphf);
|
|
|
|
void LoadRange(size_t i);
|
|
|
|
|
|
|
|
size_t Save(std::string filename);
|
|
|
|
size_t Save(std::FILE * mphf);
|
|
|
|
|
|
|
|
size_t Load(std::string filename);
|
|
|
|
size_t Load(std::FILE * mphf);
|
|
|
|
|
|
|
|
size_t GetSize() const;
|
|
|
|
|
|
|
|
void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
|
|
|
|
|
|
|
|
template <typename Keys>
|
|
|
|
void AddRange(Keys &keys) {
|
|
|
|
size_t current = m_landmarks.size();
|
|
|
|
|
|
|
|
if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) {
|
2014-01-13 22:32:22 +04:00
|
|
|
std::stringstream strme;
|
|
|
|
strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
|
|
|
|
strme << "1: " << m_landmarks.back().str() << std::endl;
|
|
|
|
strme << "2: " << keys[0] << std::endl;
|
|
|
|
UTIL_THROW2(strme.str());
|
2013-05-29 21:16:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
m_landmarks.push_back(keys[0]);
|
|
|
|
m_size += keys.size();
|
|
|
|
|
|
|
|
if(keys.size() == 1) {
|
|
|
|
// add dummy key to avoid null hash
|
|
|
|
keys.push_back("###DUMMY_KEY###");
|
|
|
|
}
|
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
#ifdef WITH_THREADS
|
2015-03-21 19:12:52 +03:00
|
|
|
|
|
|
|
boost::shared_ptr<HashTask<Keys> >
|
|
|
|
ht(new HashTask<Keys>(current, *this, keys));
|
2013-05-29 21:16:15 +04:00
|
|
|
m_threadPool.Submit(ht);
|
2012-08-02 20:32:55 +04:00
|
|
|
#else
|
2013-05-29 21:16:15 +04:00
|
|
|
CalcHash(current, keys);
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
2013-05-29 21:16:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Keys>
|
|
|
|
void CalcHash(size_t current, Keys &keys) {
|
|
|
|
#ifdef HAVE_CMPH
|
|
|
|
void* source = vectorAdapter(keys);
|
|
|
|
CalcHash(current, source);
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
2013-05-29 21:16:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void CalcHash(size_t current, void* source);
|
2012-08-02 20:32:55 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
#ifdef HAVE_CMPH
|
|
|
|
void* vectorAdapter(std::vector<std::string>& v);
|
|
|
|
void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
|
|
|
|
void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
|
2012-08-04 17:39:30 +04:00
|
|
|
#endif
|
2012-08-02 20:32:55 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|
2012-08-09 21:53:27 +04:00
|
|
|
#endif
|