2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
// vim::tabstop=2
|
|
|
|
|
|
|
|
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
|
|
|
|
|
|
|
|
#include <sstream>
|
|
|
|
#include "memory.h"
|
|
|
|
#include "Word.h"
|
|
|
|
#include "TypeDef.h"
|
2013-07-04 18:13:29 +04:00
|
|
|
#include "FactorTypeSet.h"
|
2013-10-03 14:05:53 +04:00
|
|
|
#include "FactorCollection.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "StaticData.h" // needed to determine the FactorDelimiter
|
2013-03-28 14:27:11 +04:00
|
|
|
#include "util/exception.hh"
|
2012-10-14 20:35:58 +04:00
|
|
|
#include "util/tokenize_piece.hh"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2014-06-03 18:38:28 +04:00
|
|
|
|
|
|
|
// utility function for factorless decoding
|
|
|
|
size_t
|
|
|
|
max_fax()
|
|
|
|
{
|
|
|
|
if (StaticData::Instance().GetFactorDelimiter().size())
|
|
|
|
return MAX_NUM_FACTORS;
|
|
|
|
return 1;
|
|
|
|
}
|
2008-10-09 03:51:26 +04:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// static
|
|
|
|
int Word::Compare(const Word &targetWord, const Word &sourceWord)
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (targetWord.IsNonTerminal() != sourceWord.IsNonTerminal()) {
|
|
|
|
return targetWord.IsNonTerminal() ? -1 : 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t factorType = 0 ; factorType < MAX_NUM_FACTORS ; factorType++) {
|
2014-06-03 18:38:28 +04:00
|
|
|
const Factor *targetFactor = targetWord[factorType];
|
|
|
|
const Factor *sourceFactor = sourceWord[factorType];
|
2010-04-08 21:16:10 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
if (targetFactor == NULL || sourceFactor == NULL)
|
|
|
|
continue;
|
|
|
|
if (targetFactor == sourceFactor)
|
|
|
|
continue;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
return (targetFactor<sourceFactor) ? -1 : +1;
|
|
|
|
}
|
|
|
|
return 0;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
void Word::Merge(const Word &sourceWord)
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
|
|
|
|
const Factor *sourcefactor = sourceWord.m_factorArray[currFactor]
|
|
|
|
,*targetFactor = this ->m_factorArray[currFactor];
|
|
|
|
if (targetFactor == NULL && sourcefactor != NULL) {
|
|
|
|
m_factorArray[currFactor] = sourcefactor;
|
|
|
|
}
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlank) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
stringstream strme;
|
|
|
|
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
|
|
|
bool firstPass = true;
|
2014-06-03 18:38:28 +04:00
|
|
|
unsigned int stop = min(max_fax(),factorType.size());
|
|
|
|
for (unsigned int i = 0 ; i < stop ; i++) {
|
|
|
|
UTIL_THROW_IF2(factorType[i] >= MAX_NUM_FACTORS,
|
|
|
|
"Trying to reference factor " << factorType[i]
|
|
|
|
<< ". Max factor is " << MAX_NUM_FACTORS);
|
2013-11-19 21:23:19 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
const Factor *factor = m_factorArray[factorType[i]];
|
|
|
|
if (factor != NULL) {
|
|
|
|
if (firstPass) {
|
|
|
|
firstPass = false;
|
|
|
|
} else {
|
|
|
|
strme << factorDelimiter;
|
|
|
|
}
|
|
|
|
strme << factor->GetString();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(endWithBlank) strme << " ";
|
|
|
|
return strme.str();
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Return the string of the single factor stored in slot `factorType`.
// The index is range-checked and the slot must hold a factor; both
// conditions are enforced through UTIL_THROW_IF2 so that a bad index or
// an unset factor is reported instead of reading outside m_factorArray
// or dereferencing a NULL pointer.
StringPiece Word::GetString(FactorType factorType) const
{
  UTIL_THROW_IF2(factorType >= MAX_NUM_FACTORS,
                 "Trying to reference factor " << factorType
                 << ". Max factor is " << MAX_NUM_FACTORS);

  const Factor *factor = m_factorArray[factorType];
  UTIL_THROW_IF2(factor == NULL,
                 "Factor " << factorType << " is not set for this word.");

  return factor->GetString();
}
|
|
|
|
|
2013-03-28 14:27:11 +04:00
|
|
|
// Exception type raised by Word::CreateFromString when a word still has
// tokens left after the factor delimiter has split it into the maximum
// number of factors.
class StrayFactorException : public util::Exception {};
|
|
|
|
|
2014-04-08 15:43:21 +04:00
|
|
|
void
|
|
|
|
Word::
|
|
|
|
CreateFromString(FactorDirection direction
|
|
|
|
, const std::vector<FactorType> &factorOrder
|
|
|
|
, const StringPiece &str
|
2014-04-11 18:24:52 +04:00
|
|
|
, bool isNonTerminal
|
|
|
|
, bool strict)
|
2010-04-08 21:16:10 +04:00
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
2014-04-11 18:24:52 +04:00
|
|
|
vector<StringPiece> bits(MAX_NUM_FACTORS);
|
2014-05-31 18:22:37 +04:00
|
|
|
string factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
|
|
|
if (factorDelimiter.size())
|
|
|
|
{
|
|
|
|
util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
|
|
|
|
size_t i = 0;
|
|
|
|
for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
|
|
|
|
bits[i] = *fit;
|
|
|
|
if (i == MAX_NUM_FACTORS)
|
|
|
|
UTIL_THROW_IF(fit, StrayFactorException,
|
|
|
|
"The hard limit for factors is " << MAX_NUM_FACTORS
|
|
|
|
<< ". The word " << str << " contains factor delimiter "
|
|
|
|
<< StaticData::Instance().GetFactorDelimiter()
|
|
|
|
<< " too many times.");
|
|
|
|
if (strict)
|
|
|
|
UTIL_THROW_IF(fit, StrayFactorException,
|
|
|
|
"You have configured " << factorOrder.size()
|
|
|
|
<< " factors but the word " << str
|
|
|
|
<< " contains factor delimiter "
|
|
|
|
<< StaticData::Instance().GetFactorDelimiter()
|
|
|
|
<< " too many times.");
|
2014-07-11 19:26:48 +04:00
|
|
|
UTIL_THROW_IF(!isNonTerminal && i < factorOrder.size(),util::Exception,
|
2014-05-31 18:22:37 +04:00
|
|
|
"Too few factors in string '" << str << "'.");
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bits[0] = str;
|
|
|
|
}
|
2014-04-11 18:24:52 +04:00
|
|
|
for (size_t k = 0; k < factorOrder.size(); ++k)
|
|
|
|
{
|
|
|
|
UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
|
|
|
|
"Factor order out of bounds.");
|
2014-04-13 01:10:40 +04:00
|
|
|
m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
|
2014-04-11 18:24:52 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
// assume term/non-term same for all factors
|
|
|
|
m_isNonTerminal = isNonTerminal;
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void Word::CreateUnknownWord(const Word &sourceWord)
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
|
|
|
|
2014-03-21 14:53:15 +04:00
|
|
|
m_isNonTerminal = sourceWord.IsNonTerminal();
|
|
|
|
|
2014-06-03 18:38:28 +04:00
|
|
|
// const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
|
|
|
unsigned int stop = max_fax();
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < stop; currFactor++) {
|
2011-02-24 16:14:42 +03:00
|
|
|
FactorType factorType = static_cast<FactorType>(currFactor);
|
|
|
|
|
|
|
|
const Factor *sourceFactor = sourceWord[currFactor];
|
|
|
|
if (sourceFactor == NULL)
|
2014-03-21 14:53:15 +04:00
|
|
|
SetFactor(factorType, factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR, m_isNonTerminal));
|
2011-02-24 16:14:42 +03:00
|
|
|
else
|
2014-03-21 14:53:15 +04:00
|
|
|
SetFactor(factorType, factorCollection.AddFactor(Output, factorType, sourceFactor->GetString(), m_isNonTerminal));
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
2014-03-21 14:53:15 +04:00
|
|
|
|
2013-09-18 17:47:49 +04:00
|
|
|
m_isOOV = true;
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2013-07-04 18:13:29 +04:00
|
|
|
void Word::OnlyTheseFactors(const FactorMask &factors)
|
|
|
|
{
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
|
|
|
|
if (!factors[currFactor]) {
|
|
|
|
SetFactor(currFactor, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-04 17:18:11 +04:00
|
|
|
bool Word::IsEpsilon() const
|
|
|
|
{
|
|
|
|
const Factor *factor = m_factorArray[0];
|
|
|
|
int compare = factor->GetString().compare(EPSILON);
|
|
|
|
|
|
|
|
return compare == 0;
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// NOTE(review): TO_STRING_BODY is a macro defined outside this file;
// presumably it expands to a string-conversion member for Word built on
// the stream operator - confirm against the macro definition.
TO_STRING_BODY(Word);
|
|
|
|
|
|
|
|
// friend
|
|
|
|
ostream& operator<<(ostream& out, const Word& word)
|
2011-02-24 16:14:42 +03:00
|
|
|
{
|
|
|
|
stringstream strme;
|
|
|
|
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
|
|
|
bool firstPass = true;
|
2014-06-03 18:38:28 +04:00
|
|
|
unsigned int stop = max_fax();
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < stop; currFactor++) {
|
2011-02-24 16:14:42 +03:00
|
|
|
FactorType factorType = static_cast<FactorType>(currFactor);
|
|
|
|
const Factor *factor = word.GetFactor(factorType);
|
|
|
|
if (factor != NULL) {
|
|
|
|
if (firstPass) {
|
|
|
|
firstPass = false;
|
|
|
|
} else {
|
|
|
|
strme << factorDelimiter;
|
|
|
|
}
|
|
|
|
strme << *factor;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out << strme.str() << " ";
|
|
|
|
return out;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
2008-10-09 03:51:26 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|