mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-02 17:09:36 +03:00
389 lines
11 KiB
C++
389 lines
11 KiB
C++
/**
|
|
* LossyCounter - implementation of Lossy Counting algorithm as described in:
|
|
* Approximate Frequency Counts over Data Streams, G.S.Manku & R.Motwani, (2002)
|
|
*
|
|
* (C) Ceslav Przywara, UFAL MFF UK, 2011
|
|
*
|
|
* Implementation note: define USE_UNORDERED_MAP to use std::tr1::unordered_map
|
|
* instead std::map for storage of lossy counted items.
|
|
*
|
|
* $Id$
|
|
*/
|
|
|
|
#ifndef LOSSYCOUNTER_H
|
|
#define LOSSYCOUNTER_H
|
|
|
|
#include <stddef.h>
|
|
#include <math.h>
|
|
#ifdef USE_UNORDERED_MAP
|
|
#include <tr1/unordered_map>
|
|
#else
|
|
#include <map>
|
|
#endif
|
|
#include <iterator>
|
|
|
|
|
|
// Iterators:
|
|
template<class T>
|
|
class LossyCounterIterator;
|
|
|
|
template<class T>
|
|
class LossyCounterErasingIterator;
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////// Lossy Counter Interface ////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
|
|
* Implementation of Lossy Counting algorithm as described in:
|
|
* Approximate Frequency Counts over Data Streams, G.S.Manku & R.Motwani, (2002)
|
|
*/
|
|
template<class T>
|
|
class LossyCounter {
|
|
|
|
public:
|
|
|
|
// Error parameter type definition.
|
|
typedef double error_t;
|
|
|
|
// Support parameter type definition.
|
|
typedef double support_t;
|
|
|
|
// Counters type definition.
|
|
typedef size_t counter_t;
|
|
|
|
// Frequency counter type definition (f).
|
|
typedef counter_t frequency_t;
|
|
|
|
// Maximum error counter type definition (Δ).
|
|
typedef counter_t maximum_error_t;
|
|
|
|
// Pair: frequency (f) and possible maximum error (Δ).
|
|
typedef std::pair<frequency_t, maximum_error_t> item_counts_t;
|
|
#ifdef USE_UNORDERED_MAP
|
|
typedef std::tr1::unordered_map<T, item_counts_t, typename T::Hash> storage_t;
|
|
#else
|
|
// Mapping: counted item and its frequency and max. error counts.
|
|
typedef std::map<T, item_counts_t> storage_t;
|
|
#endif
|
|
// We provide own version of const iterator.
|
|
typedef LossyCounterIterator<T> const_iterator;
|
|
|
|
// Special type of iterator: leaves no item behind!
|
|
typedef LossyCounterErasingIterator<T> erasing_iterator;
|
|
|
|
/** @var Error parameter value (ε) */
|
|
const error_t error;
|
|
|
|
/** @var Supprort parameter value (s) */
|
|
const support_t support;
|
|
|
|
/** @var Width of single bucket (w) */
|
|
const counter_t bucketWidth; // ceil(1/error)
|
|
|
|
private:
|
|
|
|
/** @var Current epoch bucket ID (b-current) */
|
|
counter_t _bucketId;
|
|
|
|
/** @var Count of items read in so far (N) */
|
|
counter_t _count;
|
|
|
|
/** @var Items storage (Ɗ) */
|
|
storage_t _storage;
|
|
|
|
public:
|
|
|
|
/**
|
|
* Set error to 0 to disable lossy-pruning.
|
|
* @param _error Value from interval [0.0, 1.0).
|
|
* @param _support Value from interval [0.0, 1.0).
|
|
*/
|
|
LossyCounter<T>(error_t _error, support_t _support):
|
|
error(_error), support(_support), bucketWidth(_error > 0.0 ? ceil(1/_error) : 0), _bucketId(1), _count(0), _storage() {}
|
|
|
|
/**
|
|
* @param item Item to be added to storage.
|
|
*/
|
|
void add(const T& item);
|
|
|
|
/**
|
|
* @return Constant iterator pointing to the beginning of storage.
|
|
*/
|
|
const_iterator begin(void) const { return LossyCounterIterator<T>(threshold(), _storage.begin(), _storage.end()); }
|
|
|
|
/**
|
|
* @return Constant iterator pointing to the end of storage.
|
|
*/
|
|
const_iterator end(void) const { return LossyCounterIterator<T>(_storage.end()); }
|
|
|
|
/**
|
|
* @return Erasing iterator pointing to the beginning of storage.
|
|
*/
|
|
erasing_iterator beginErase(void) { return LossyCounterErasingIterator<T>(threshold(), _storage); }
|
|
|
|
/**
|
|
* @return Erasing iterator pointing to the end of storage.
|
|
*/
|
|
erasing_iterator endErase(void) { return LossyCounterErasingIterator<T>(_storage); }
|
|
|
|
/**
|
|
* @return Current bucket ID.
|
|
*/
|
|
counter_t bucketId(void) const { return _bucketId; }
|
|
|
|
/**
|
|
* @return Number of items added to storage so far (N).
|
|
*/
|
|
counter_t count(void) const { return _count; }
|
|
|
|
/**
|
|
* @return Number of items currently in storage.
|
|
*/
|
|
size_t size(void) const { return _storage.size(); }
|
|
|
|
/**
|
|
* @param positive Return sN value instead of (s-ε)N?
|
|
* @return Threshold (either positive or negative) value.
|
|
*/
|
|
double threshold(bool positive = false) const { return positive ? support * _count : (support - error) * _count; }
|
|
|
|
/**
|
|
* @return The maximum value of which estimated frequencies are less than the true frequencies.
|
|
*/
|
|
double maxError(void) const { return error * _count; }
|
|
|
|
/**
|
|
* @return True, if it's prunning time right now!
|
|
*/
|
|
bool aboutToPrune(void) const { return (bucketWidth != 0) && ((_count % bucketWidth) == 0); }
|
|
|
|
private:
|
|
|
|
/**
|
|
* Prunes counts table.
|
|
*/
|
|
void prune(void);
|
|
|
|
};
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////// Lossy Counter Iterator Interface ///////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/**
|
|
* Lossy counter iterator is designed to iterate over items passing the current
|
|
* threshold.
|
|
*/
|
|
template<class T>
|
|
class LossyCounterIterator: public std::iterator<std::forward_iterator_tag, typename LossyCounter<T>::storage_t::value_type> {
|
|
public:
|
|
|
|
typedef LossyCounterIterator<T> self_type;
|
|
|
|
typedef typename LossyCounter<T>::storage_t::const_iterator const_iterator;
|
|
|
|
protected:
|
|
|
|
/** @var Minimum frequency threshold */
|
|
const double _threshold;
|
|
|
|
/** @var Current position of iterator (based on underlying container) */
|
|
const_iterator _current;
|
|
|
|
/** @var End position in underlying container */
|
|
const const_iterator _end;
|
|
|
|
public:
|
|
|
|
// Constructors.
|
|
|
|
LossyCounterIterator<T>(const_iterator end):
|
|
_threshold(0), _current(_end), _end(end) {}
|
|
|
|
LossyCounterIterator<T>(double threshold, const_iterator begin, const_iterator end):
|
|
_threshold(threshold), _current(begin), _end(end) {
|
|
// Forward the iterator to the first valid item (with frequency above threshold)!
|
|
this->forward(true);
|
|
}
|
|
|
|
// Operators.
|
|
|
|
/**
|
|
* Only items passing the threshold are included in iteration.
|
|
*/
|
|
LossyCounterIterator<T> operator++(void); // ++this
|
|
|
|
/**
|
|
* Only items passing the threshold are included in iteration.
|
|
*/
|
|
LossyCounterIterator<T> operator++(int junk); // this++
|
|
|
|
bool operator==(const self_type& rhs) const { return _current == rhs._current; }
|
|
|
|
bool operator!=(const self_type& rhs) const { return _current != rhs._current; }
|
|
|
|
// Interface.
|
|
|
|
/**
|
|
* @return Current item.
|
|
*/
|
|
const T& item(void) const { return _current->first; }
|
|
|
|
/**
|
|
* @return Current item's frequency.
|
|
*/
|
|
typename LossyCounter<T>::frequency_t frequency(void) const { return _current->second.first; }
|
|
|
|
/**
|
|
* @return Current item's maximum error.
|
|
*/
|
|
typename LossyCounter<T>::error_t max_error(void) const { return _current->second.second; }
|
|
|
|
protected:
|
|
|
|
/**
|
|
* @param init Check also the item that iterator _current points to? Useful when initializing.
|
|
*/
|
|
void forward(bool init);
|
|
|
|
};
|
|
|
|
/**
|
|
* Lossy counter iterator erasing all items passed by.
|
|
*/
|
|
template<class T>
|
|
class LossyCounterErasingIterator: public LossyCounterIterator<T> {
|
|
public:
|
|
|
|
typedef typename LossyCounter<T>::storage_t storage_t;
|
|
|
|
private:
|
|
|
|
storage_t& _storage;
|
|
|
|
public:
|
|
|
|
// Constructors - have to be aware of the "mother" container.
|
|
|
|
LossyCounterErasingIterator<T>(storage_t& storage): LossyCounterIterator<T>(storage.end()), _storage(storage) {}
|
|
|
|
LossyCounterErasingIterator<T>(double threshold, storage_t& storage): LossyCounterIterator<T>(threshold, storage.begin(), storage.end()), _storage(storage) {}
|
|
|
|
protected:
|
|
|
|
/**
|
|
* @param init Check also the item that iterator _current points to? Useful when initializing.
|
|
*/
|
|
void forward(bool init);
|
|
|
|
};
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//////////////////////// Lossy Counter Implementation //////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template<class T>
|
|
void LossyCounter<T>::add(const T& item) {
|
|
|
|
typename storage_t::iterator iter = _storage.find(item);
|
|
|
|
if ( iter == _storage.end() ) {
|
|
// Insert new item with appropriate frequency and maximum-possible-error.
|
|
_storage.insert(std::make_pair(item, item_counts_t(1, _bucketId - 1)));
|
|
}
|
|
else {
|
|
// Update frequency of existing.
|
|
iter->second.first += 1;
|
|
}
|
|
|
|
// Finally increment the counter and check if the table shall be pruned.
|
|
++_count;
|
|
if ( this->aboutToPrune() ) {
|
|
this->prune();
|
|
++_bucketId;
|
|
}
|
|
}
|
|
|
|
|
|
template<class T>
|
|
void LossyCounter<T>::prune(void) {
|
|
|
|
for ( typename storage_t::iterator iter = _storage.begin(); iter != _storage.end(); /* Incrementation step missing intentionally! */ ) {
|
|
// Prune, if: maximum possible error + frequency <= ID of current bucket
|
|
if ( (iter->second.first + iter->second.second) <= _bucketId ) {
|
|
_storage.erase(iter++); // Post increment!
|
|
}
|
|
else {
|
|
++iter;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//////////////////// Lossy Counter Iterator Implementation /////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template<class T>
|
|
LossyCounterIterator<T> LossyCounterIterator<T>::operator++(void) {
|
|
this->forward();
|
|
return *this;
|
|
}
|
|
|
|
|
|
template<class T>
|
|
LossyCounterIterator<T> LossyCounterIterator<T>::operator++(int junk) {
|
|
self_type iter = *this;
|
|
this->forward();
|
|
return iter;
|
|
}
|
|
|
|
template<class T>
|
|
void LossyCounterIterator<T>::forward(bool init = false) {
|
|
if ( _current == _end ) {
|
|
return; // Nowhere to go, we're at the end already.
|
|
}
|
|
if ( init && (this->frequency() >= _threshold) ) {
|
|
// Ok, we're initing and we're already at the item passing the threshold.
|
|
return;
|
|
}
|
|
// Keep going until we reach the end or the next item with frequency NOT below the threshold.
|
|
while ( (++_current != _end) && (this->frequency() < _threshold) );
|
|
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//////////////// Lossy Counter Erasing Iterator Implementation /////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template<class T>
|
|
void LossyCounterErasingIterator<T>::forward(bool init = false) {
|
|
if (this->_current == this->_end ) {
|
|
return; // Nowhere to go, we're at the end already.
|
|
}
|
|
if ( init && (this->frequency() >= this->_threshold) ) {
|
|
// Ok, we're initing and we're already at the item passing the threshold.
|
|
return;
|
|
}
|
|
// This is where erasing forward() and std forward() differ:
|
|
typename LossyCounterIterator<T>::const_iterator previous = this->_current;
|
|
// Keep going...
|
|
while ( ++this->_current != this->_end ) {
|
|
if ( this->frequency() < this->_threshold ) {
|
|
// Get rid of previous, cause we'll go on.
|
|
_storage.erase(previous);
|
|
previous = this->_current;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
_storage.erase(previous); // !
|
|
}
|
|
|
|
#endif /* LOSSYCOUNTER_H */
|