// $Id$ // vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #ifndef moses_CanonicalHuffman_h #define moses_CanonicalHuffman_h #include #include #include #include #include "ThrowingFwrite.h" namespace Moses { template class CanonicalHuffman { private: std::vector m_symbols; std::vector m_firstCodes; std::vector m_lengthIndex; typedef boost::unordered_map > EncodeMap; EncodeMap m_encodeMap; struct MinHeapSorter { std::vector& m_vec; MinHeapSorter(std::vector& vec) : m_vec(vec) { } bool operator()(size_t a, size_t b) { return m_vec[a] > m_vec[b]; } }; template void CalcLengths(Iterator begin, Iterator end, std::vector& lengths) { size_t n = std::distance(begin, end); std::vector A(2 * n, 0); m_symbols.resize(n); size_t i = 0; for(Iterator it = begin; it != end; it++) { m_symbols[i] = it->first; A[i] = n + i; A[n + i] = it->second; i++; } if(n == 1) { lengths.push_back(1); return; } MinHeapSorter hs(A); std::make_heap(A.begin(), A.begin() + n, hs); size_t h = n; size_t m1, m2; while(h > 1) { m1 = A[0]; std::pop_heap(A.begin(), A.begin() + h, hs); h--; m2 = A[0]; std::pop_heap(A.begin(), A.begin() + h, hs); A[h] = A[m1] + A[m2]; A[h-1] = h; A[m1] = A[m2] = h; std::push_heap(A.begin(), A.begin() + h, hs); } A[1] = 0; for(size_t i = 2; i < 2*n; i++) A[i] = A[A[i]] + 1; lengths.resize(n); for(size_t i = 0; i < n; i++) lengths[i] = A[i + n]; } void CalcCodes(std::vector& lengths) { std::vector numLength; for(std::vector::iterator it = lengths.begin(); it != lengths.end(); it++) { size_t length = *it; if(numLength.size() <= length) numLength.resize(length + 1, 0); numLength[length]++; } m_lengthIndex.resize(numLength.size()); m_lengthIndex[0] = 0; for(size_t l = 1; l < numLength.size(); l++) m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1]; size_t maxLength = numLength.size() - 1; m_firstCodes.resize(maxLength + 1, 0); for(size_t l = maxLength - 1; l > 0; l--) m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2; std::vector t_symbols; t_symbols.resize(lengths.size()); std::vector nextCode = m_firstCodes; for(size_t i = 0; i < lengths.size(); i++) { Data data = m_symbols[i]; size_t length = lengths[i]; size_t pos = m_lengthIndex[length] + (nextCode[length] - m_firstCodes[length]); t_symbols[pos] = data; nextCode[length] = nextCode[length] + 1; } m_symbols.swap(t_symbols); } void CreateCodeMap() { for(size_t l = 1; l < m_lengthIndex.size(); l++) { size_t intCode = m_firstCodes[l]; size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1] : m_symbols.size()) - m_lengthIndex[l]; for(size_t i = 0; i < num; i++) { Data data = m_symbols[m_lengthIndex[l] + i]; boost::dynamic_bitset<> bitCode(l, intCode); m_encodeMap[data] = bitCode; intCode++; } } } boost::dynamic_bitset<>& Encode(Data data) { return m_encodeMap[data]; } template void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code) { for(int j = code.size()-1; j >= 0; j--) bitWrapper.Put(code[j]); } public: template CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) { std::vector lengths; CalcLengths(begin, end, lengths); CalcCodes(lengths); if(forEncoding) CreateCodeMap(); } CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) { Load(pFile); if(forEncoding) CreateCodeMap(); } template void Put(BitWrapper& bitWrapper, Data data) { PutCode(bitWrapper, Encode(data)); } template Data Read(BitWrapper& bitWrapper) { if(bitWrapper.TellFromEnd()) { size_t intCode = bitWrapper.Read(); size_t len = 1; while(intCode < m_firstCodes[len]) { intCode = 2 * intCode + bitWrapper.Read(); len++; } return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])]; } return Data(); } size_t Load(std::FILE* pFile) { size_t start = std::ftell(pFile); size_t read = 0; size_t size; read += std::fread(&size, sizeof(size_t), 1, pFile); m_symbols.resize(size); read += std::fread(&m_symbols[0], sizeof(Data), size, pFile); read += std::fread(&size, sizeof(size_t), 1, pFile); m_firstCodes.resize(size); read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile); read += std::fread(&size, sizeof(size_t), 1, pFile); m_lengthIndex.resize(size); read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile); return std::ftell(pFile) - start; } size_t Save(std::FILE* pFile) { size_t start = std::ftell(pFile); size_t size = m_symbols.size(); ThrowingFwrite(&size, sizeof(size_t), 1, pFile); ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile); size = m_firstCodes.size(); ThrowingFwrite(&size, sizeof(size_t), 1, pFile); ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile); size = m_lengthIndex.size(); ThrowingFwrite(&size, sizeof(size_t), 1, pFile); ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile); return std::ftell(pFile) - start; } }; template class BitWrapper { private: Container& m_data; typename Container::iterator m_iterator; typename Container::value_type m_currentValue; size_t m_valueBits; typename Container::value_type m_mask; size_t m_bitPos; public: BitWrapper(Container &data) : m_data(data), m_iterator(m_data.begin()), m_currentValue(0), m_valueBits(sizeof(typename Container::value_type) * 8), m_mask(1), m_bitPos(0) { } bool Read() { if(m_bitPos % m_valueBits == 0) { if(m_iterator != m_data.end()) m_currentValue = *m_iterator++; } else m_currentValue = m_currentValue >> 1; m_bitPos++; return (m_currentValue & m_mask); } void Put(bool bit) { if(m_bitPos % m_valueBits == 0) m_data.push_back(0); if(bit) m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits); m_bitPos++; } size_t Tell() { return m_bitPos; } size_t TellFromEnd() { if(m_data.size() * m_valueBits < m_bitPos) return 0; return m_data.size() * m_valueBits - m_bitPos; } void Seek(size_t bitPos) { m_bitPos = bitPos; m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits); m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits); m_iterator++; } void SeekFromEnd(size_t bitPosFromEnd) { size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd; Seek(bitPos); } void Reset() { m_iterator = m_data.begin(); m_currentValue = 0; m_bitPos = 0; } Container& GetContainer() { return m_data; } }; } #endif