2008-06-11 14:52:57 +04:00
// $Id$
/***********************************************************************
Moses - factored phrase - based language decoder
Copyright ( C ) 2006 University of Edinburgh
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2.1 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2010-02-24 14:15:44 +03:00
# ifndef moses_WordsBitmap_h
# define moses_WordsBitmap_h
2008-06-11 14:52:57 +04:00
2015-07-16 13:56:20 +03:00
# include <algorithm>
2008-06-11 14:52:57 +04:00
# include <limits>
# include <vector>
# include <iostream>
# include <cstring>
# include <cmath>
2009-07-23 14:29:30 +04:00
# include <cstdlib>
2008-06-11 14:52:57 +04:00
# include "TypeDef.h"
2015-10-25 16:37:59 +03:00
# include "Range.h"
2008-06-11 14:52:57 +04:00
2008-10-09 03:51:26 +04:00
namespace Moses
{
2008-12-13 15:08:55 +03:00
//! Integer type of the identifiers returned by Bitmap::GetID() and Bitmap::GetIDPlus().
typedef unsigned long WordsBitmapID ;
2008-10-09 03:51:26 +04:00
2015-07-16 13:56:20 +03:00
/** Vector of boolean to represent whether a word has been translated or not.
*
2015-07-17 15:23:47 +03:00
* Implemented using a vector of char , which is usually the same representation
* for the elements that a C array of bool would use . A vector of bool , or a
* Boost dynamic_bitset , could be much more efficient in theory . Unfortunately
* algorithms like std : : find ( ) are not optimized for vector < bool > on gcc or
* clang , and dynamic_bitset lacks all the optimized search operations we want .
* Only benchmarking will tell what works best . Perhaps dynamic_bitset could
* still be a dramatic improvement , if we flip the meaning of the bits around
* so we can use its find_first ( ) and find_next ( ) for the most common searches .
2015-07-16 13:56:20 +03:00
*/
2015-10-25 16:07:25 +03:00
class Bitmap
2008-06-11 14:52:57 +04:00
{
2015-10-25 16:07:25 +03:00
friend std : : ostream & operator < < ( std : : ostream & out , const Bitmap & bitmap ) ;
2015-07-16 13:56:20 +03:00
private :
std : : vector < char > m_bitmap ; //! Ticks of words in sentence that have been done.
size_t m_firstGap ; //! Cached position of first gap, or NOT_FOUND.
2015-10-26 12:50:27 +03:00
size_t m_numWordsCovered ;
2011-02-24 16:14:42 +03:00
2015-10-25 16:07:25 +03:00
Bitmap ( ) ; // not implemented
Bitmap & operator = ( const Bitmap & other ) ;
2011-02-24 16:14:42 +03:00
2015-05-22 17:31:23 +03:00
/** Update the first gap, when bits are flipped */
void UpdateFirstGap ( size_t startPos , size_t endPos , bool value ) {
if ( value ) {
//may remove gap
if ( startPos < = m_firstGap & & m_firstGap < = endPos ) {
m_firstGap = NOT_FOUND ;
2015-07-16 13:56:20 +03:00
for ( size_t i = endPos + 1 ; i < m_bitmap . size ( ) ; + + i ) {
2015-05-22 17:31:23 +03:00
if ( ! m_bitmap [ i ] ) {
m_firstGap = i ;
break ;
}
}
}
} else {
//setting positions to false, may add new gap
if ( startPos < m_firstGap ) {
m_firstGap = startPos ;
}
2011-02-24 16:14:42 +03:00
}
}
2010-05-04 01:39:23 +04:00
2015-10-26 12:20:08 +03:00
//! set value between 2 positions, inclusive
void
2015-10-26 12:50:27 +03:00
SetValueNonOverlap ( Range const & range ) {
2015-10-26 15:03:54 +03:00
size_t startPos = range . GetStartPos ( ) ;
size_t endPos = range . GetEndPos ( ) ;
2015-10-26 12:20:08 +03:00
2015-10-26 15:03:54 +03:00
for ( size_t pos = startPos ; pos < = endPos ; pos + + ) {
m_bitmap [ pos ] = true ;
}
2015-10-26 12:50:27 +03:00
2015-10-26 15:03:54 +03:00
m_numWordsCovered + = range . GetNumWordsCovered ( ) ;
UpdateFirstGap ( startPos , endPos , true ) ;
2015-10-26 12:20:08 +03:00
}
2010-05-04 01:39:23 +04:00
2008-06-11 14:52:57 +04:00
public :
2015-10-25 16:07:25 +03:00
//! Create Bitmap of length size, and initialise with vector.
2015-10-26 14:59:20 +03:00
explicit Bitmap ( size_t size , const std : : vector < bool > & initializer ) ;
2015-07-16 13:56:20 +03:00
2015-10-25 16:07:25 +03:00
//! Create Bitmap of length size and initialise.
2015-10-26 14:59:20 +03:00
explicit Bitmap ( size_t size ) ;
2015-07-16 13:56:20 +03:00
//! Deep copy.
2015-10-26 14:59:20 +03:00
explicit Bitmap ( const Bitmap & copy ) ;
2015-07-16 13:56:20 +03:00
2015-10-26 14:59:20 +03:00
explicit Bitmap ( const Bitmap & copy , const Range & range ) ;
2015-10-26 12:20:08 +03:00
2015-07-16 13:56:20 +03:00
//! Count of words translated.
2011-02-24 16:14:42 +03:00
size_t GetNumWordsCovered ( ) const {
2015-10-26 12:50:27 +03:00
return m_numWordsCovered ;
2011-02-24 16:14:42 +03:00
}
//! position of 1st word not yet translated, or NOT_FOUND if everything already translated
size_t GetFirstGapPos ( ) const {
2015-05-22 17:31:23 +03:00
return m_firstGap ;
2011-02-24 16:14:42 +03:00
}
//! position of last word not yet translated, or NOT_FOUND if everything already translated
size_t GetLastGapPos ( ) const {
2015-07-16 13:56:20 +03:00
for ( int pos = int ( m_bitmap . size ( ) ) - 1 ; pos > = 0 ; pos - - ) {
2011-02-24 16:14:42 +03:00
if ( ! m_bitmap [ pos ] ) {
return pos ;
}
}
// no starting pos
return NOT_FOUND ;
}
//! position of last translated word
size_t GetLastPos ( ) const {
2015-07-16 13:56:20 +03:00
for ( int pos = int ( m_bitmap . size ( ) ) - 1 ; pos > = 0 ; pos - - ) {
2011-02-24 16:14:42 +03:00
if ( m_bitmap [ pos ] ) {
return pos ;
}
}
// no starting pos
return NOT_FOUND ;
}
//! whether a word has been translated at a particular position
bool GetValue ( size_t pos ) const {
2015-07-16 13:56:20 +03:00
return bool ( m_bitmap [ pos ] ) ;
2011-02-24 16:14:42 +03:00
}
//! set value at a particular position
void SetValue ( size_t pos , bool value ) {
2015-10-26 12:50:27 +03:00
bool origValue = m_bitmap [ pos ] ;
if ( origValue = = value ) {
2015-10-26 15:03:54 +03:00
// do nothing
} else {
m_bitmap [ pos ] = value ;
UpdateFirstGap ( pos , pos , value ) ;
if ( value ) {
+ + m_numWordsCovered ;
} else {
- - m_numWordsCovered ;
}
2015-10-26 12:50:27 +03:00
}
2011-02-24 16:14:42 +03:00
}
2015-02-15 19:34:54 +03:00
2011-02-24 16:14:42 +03:00
//! whether every word has been translated
bool IsComplete ( ) const {
return GetSize ( ) = = GetNumWordsCovered ( ) ;
}
//! whether the wordrange overlaps with any translated word in this bitmap
2015-10-25 16:37:59 +03:00
bool Overlap ( const Range & compare ) const {
2011-02-24 16:14:42 +03:00
for ( size_t pos = compare . GetStartPos ( ) ; pos < = compare . GetEndPos ( ) ; pos + + ) {
if ( m_bitmap [ pos ] )
return true ;
}
return false ;
}
//! number of elements
size_t GetSize ( ) const {
2015-07-16 13:56:20 +03:00
return m_bitmap . size ( ) ;
2011-02-24 16:14:42 +03:00
}
inline size_t GetEdgeToTheLeftOf ( size_t l ) const {
if ( l = = 0 ) return l ;
while ( l & & ! m_bitmap [ l - 1 ] ) {
- - l ;
}
return l ;
}
inline size_t GetEdgeToTheRightOf ( size_t r ) const {
2015-07-16 13:56:20 +03:00
if ( r + 1 = = m_bitmap . size ( ) ) return r ;
return (
2015-07-17 02:00:42 +03:00
std : : find ( m_bitmap . begin ( ) + r + 1 , m_bitmap . end ( ) , true ) -
m_bitmap . begin ( )
) - 1 ;
2011-02-24 16:14:42 +03:00
}
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID ( ) const {
2015-07-16 13:56:20 +03:00
assert ( m_bitmap . size ( ) < ( 1 < < 16 ) ) ;
2011-02-24 16:14:42 +03:00
size_t start = GetFirstGapPos ( ) ;
2015-07-16 13:56:20 +03:00
if ( start = = NOT_FOUND ) start = m_bitmap . size ( ) ; // nothing left
2011-02-24 16:14:42 +03:00
size_t end = GetLastPos ( ) ;
if ( end = = NOT_FOUND ) end = 0 ; // nothing translated yet
2013-11-19 22:52:15 +04:00
assert ( end < start | | end - start < = 16 ) ;
2011-02-24 16:14:42 +03:00
WordsBitmapID id = 0 ;
for ( size_t pos = end ; pos > start ; pos - - ) {
id = id * 2 + ( int ) GetValue ( pos ) ;
}
return id + ( 1 < < 16 ) * start ;
}
//! converts bitmap into an integer ID, with an additional span covered
WordsBitmapID GetIDPlus ( size_t startPos , size_t endPos ) const {
2015-07-16 13:56:20 +03:00
assert ( m_bitmap . size ( ) < ( 1 < < 16 ) ) ;
2011-02-24 16:14:42 +03:00
size_t start = GetFirstGapPos ( ) ;
2015-07-16 13:56:20 +03:00
if ( start = = NOT_FOUND ) start = m_bitmap . size ( ) ; // nothing left
2011-02-24 16:14:42 +03:00
size_t end = GetLastPos ( ) ;
if ( end = = NOT_FOUND ) end = 0 ; // nothing translated yet
if ( start = = startPos ) start = endPos + 1 ;
if ( end < endPos ) end = endPos ;
2013-11-19 22:52:15 +04:00
assert ( end < start | | end - start < = 16 ) ;
2011-02-24 16:14:42 +03:00
WordsBitmapID id = 0 ;
for ( size_t pos = end ; pos > start ; pos - - ) {
id = id * 2 ;
if ( GetValue ( pos ) | | ( startPos < = pos & & pos < = endPos ) )
id + + ;
}
return id + ( 1 < < 16 ) * start ;
}
2015-10-13 00:16:39 +03:00
// for unordered_set in stack
size_t hash ( ) const ;
2015-10-25 16:07:25 +03:00
bool operator = = ( const Bitmap & other ) const ;
bool operator ! = ( const Bitmap & other ) const {
2015-10-13 00:16:39 +03:00
return ! ( * this = = other ) ;
}
2011-02-24 16:14:42 +03:00
TO_STRING ( ) ;
2008-06-11 14:52:57 +04:00
} ;
2008-10-09 03:51:26 +04:00
}
2010-02-24 14:15:44 +03:00
# endif