// $Id$ // vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #ifndef moses_BlockHashIndex_h #define moses_BlockHashIndex_h #include #include #include #include #include #include #include #include "MurmurHash3.h" #include "StringVector.h" #include "PackedArray.h" #include "util/exception.hh" #ifdef WITH_THREADS #include "moses/ThreadPool.h" #else #include #endif #include namespace Moses { class BlockHashIndex { private: std::priority_queue m_queue; size_t m_orderBits; size_t m_fingerPrintBits; std::FILE* m_fileHandle; size_t m_fileHandleStart; StringVector m_landmarks; std::vector m_hashes; std::vector m_clocks; std::vector*> m_arrays; std::vector m_seekIndex; size_t m_size; int m_lastSaved; int m_lastDropped; size_t m_numLoadedRanges; #ifdef WITH_THREADS ThreadPool m_threadPool; boost::mutex m_mutex; template class HashTask : public Task { public: HashTask(int id, BlockHashIndex& hash, Keys& keys) : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {} virtual void Run() { m_hash.CalcHash(m_id, *m_keys); } virtual ~HashTask() { delete m_keys; } private: int m_id; BlockHashIndex& m_hash; Keys* m_keys; }; #endif size_t GetFprint(const char* key) const; size_t GetHash(size_t i, const char* key); public: #ifdef WITH_THREADS BlockHashIndex(size_t orderBits, size_t fingerPrintBits, size_t threadsNum = 2); #else BlockHashIndex(size_t orderBits, size_t fingerPrintBits); #endif ~BlockHashIndex(); size_t GetHash(const char* key); size_t GetHash(std::string key); size_t operator[](std::string key); size_t operator[](char* key); void BeginSave(std::FILE* mphf); void SaveRange(size_t i); void SaveLastRange(); size_t FinalizeSave(); #ifdef WITH_THREADS void WaitAll(); #endif void DropRange(size_t i); void DropLastRange(); size_t LoadIndex(std::FILE* mphf); void LoadRange(size_t i); size_t Save(std::string filename); size_t Save(std::FILE * mphf); size_t Load(std::string filename); size_t Load(std::FILE * mphf); size_t GetSize() const; void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1); template void AddRange(Keys &keys) { size_t current = m_landmarks.size(); if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) { std::stringstream strme; strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; strme << "1: " << m_landmarks.back().str() << std::endl; strme << "2: " << keys[0] << std::endl; UTIL_THROW2(strme.str()); } m_landmarks.push_back(keys[0]); m_size += keys.size(); if(keys.size() == 1) { // add dummy key to avoid null hash keys.push_back("###DUMMY_KEY###"); } #ifdef WITH_THREADS boost::shared_ptr > ht(new HashTask(current, *this, keys)); m_threadPool.Submit(ht); #else CalcHash(current, keys); #endif } template void CalcHash(size_t current, Keys &keys) { #ifdef HAVE_CMPH void* source = vectorAdapter(keys); CalcHash(current, source); #endif } void CalcHash(size_t current, void* source); #ifdef HAVE_CMPH void* vectorAdapter(std::vector& v); void* vectorAdapter(StringVector& sv); void* vectorAdapter(StringVector& sv); #endif }; } #endif