// $Id$
// vim:tabstop=2

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
|
|
|
|
|
2011-11-18 16:07:41 +04:00
|
|
|
#include "util/check.hh"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include <algorithm>
|
|
|
|
#include <sstream>
|
|
|
|
#include <string>
|
|
|
|
#include "memory.h"
|
|
|
|
#include "FactorCollection.h"
|
|
|
|
#include "Phrase.h"
|
|
|
|
#include "StaticData.h" // GetMaxNumFactors
|
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
#include "util/string_piece.hh"
|
|
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2012-10-22 21:17:46 +04:00
|
|
|
// Default constructor: performs no explicit initialization of its own.
Phrase::Phrase() {}
|
|
|
|
|
2011-11-21 14:49:26 +04:00
|
|
|
// Construct a phrase and ask m_words to reserve room for reserveSize
// entries up front. No words are added here.
Phrase::Phrase(size_t reserveSize)
{
  m_words.reserve(reserveSize);
}
|
|
|
|
|
2011-11-21 14:49:26 +04:00
|
|
|
// Construct a phrase from a list of word pointers: each pointed-to word is
// handed to AddWord() in list order. Every pointer is dereferenced without
// a NULL test, so the caller must pass valid pointers.
Phrase::Phrase(const vector< const Word* > &mergeWords)
{
  m_words.reserve(mergeWords.size());

  for (vector< const Word* >::const_iterator iter = mergeWords.begin();
       iter != mergeWords.end(); ++iter) {
    AddWord(**iter);
  }
}
|
|
|
|
|
|
|
|
// Destructor: the body is intentionally empty.
Phrase::~Phrase() {}
|
|
|
|
|
|
|
|
void Phrase::MergeFactors(const Phrase ©)
|
|
|
|
{
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(GetSize() == copy.GetSize());
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t size = GetSize();
|
2011-11-21 14:49:26 +04:00
|
|
|
const size_t maxNumFactors = MAX_NUM_FACTORS;
|
2011-02-24 16:14:42 +03:00
|
|
|
for (size_t currPos = 0 ; currPos < size ; currPos++) {
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
|
|
|
|
FactorType factorType = static_cast<FactorType>(currFactor);
|
|
|
|
const Factor *factor = copy.GetFactor(currPos, factorType);
|
|
|
|
if (factor != NULL)
|
|
|
|
SetFactor(currPos, factorType, factor);
|
|
|
|
}
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void Phrase::MergeFactors(const Phrase ©, FactorType factorType)
|
|
|
|
{
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(GetSize() == copy.GetSize());
|
2011-02-24 16:14:42 +03:00
|
|
|
for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
|
|
|
|
SetFactor(currPos, factorType, copy.GetFactor(currPos, factorType));
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void Phrase::MergeFactors(const Phrase ©, const std::vector<FactorType>& factorVec)
|
|
|
|
{
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(GetSize() == copy.GetSize());
|
2011-02-24 16:14:42 +03:00
|
|
|
for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
|
|
|
|
for (std::vector<FactorType>::const_iterator i = factorVec.begin();
|
|
|
|
i != factorVec.end(); ++i) {
|
|
|
|
SetFactor(currPos, *i, copy.GetFactor(currPos, *i));
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Return a new phrase holding copies of the words at positions
// [wordsRange.GetStartPos(), wordsRange.GetEndPos()], both ends inclusive.
Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
{
  Phrase retPhrase(wordsRange.GetNumWordsCovered());

  const size_t firstPos = wordsRange.GetStartPos();
  const size_t lastPos = wordsRange.GetEndPos();
  for (size_t pos = firstPos; pos <= lastPos; ++pos) {
    // append a fresh word, then assign the source word over it
    retPhrase.AddWord() = GetWord(pos);
  }

  return retPhrase;
}
|
|
|
|
|
2011-06-11 02:45:17 +04:00
|
|
|
// Return a new phrase covering positions [GetStartPos(), GetEndPos()] of
// wordsRange (inclusive) in which each word carries only the requested
// factor type; no other factor is set on the new words here.
Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const
{
  Phrase retPhrase(wordsRange.GetNumWordsCovered());

  const size_t firstPos = wordsRange.GetStartPos();
  const size_t lastPos = wordsRange.GetEndPos();
  for (size_t pos = firstPos; pos <= lastPos; ++pos) {
    const Factor *sourceFactor = GetFactor(pos, factorType);
    Word &newWord = retPhrase.AddWord();
    newWord.SetFactor(factorType, sourceFactor);
  }

  return retPhrase;
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
stringstream strme;
|
|
|
|
for (size_t pos = 0 ; pos < GetSize() ; pos++) {
|
|
|
|
strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
return strme.str();
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Append a default-constructed word to the end of the phrase and return a
// reference to it so the caller can fill it in.
Word &Phrase::AddWord()
{
  m_words.push_back(Word());
  Word &appended = m_words.back();
  return appended;
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void Phrase::Append(const Phrase &endPhrase)
|
|
|
|
{
|
|
|
|
|
|
|
|
for (size_t i = 0; i < endPhrase.GetSize(); i++) {
|
|
|
|
AddWord(endPhrase.GetWord(i));
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
void Phrase::PrependWord(const Word &newWord)
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
AddWord();
|
|
|
|
|
|
|
|
// shift
|
|
|
|
for (size_t pos = GetSize() - 1; pos >= 1; --pos) {
|
|
|
|
const Word &word = m_words[pos - 1];
|
|
|
|
m_words[pos] = word;
|
|
|
|
}
|
|
|
|
|
|
|
|
m_words[0] = newWord;
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2013-03-08 23:10:28 +04:00
|
|
|
void Phrase::CreateFromString(FactorDirection direction
|
|
|
|
,const std::vector<FactorType> &factorOrder
|
|
|
|
,const StringPiece &phraseString
|
|
|
|
,const StringPiece &factorDelimiter
|
|
|
|
,Word *lhs)
|
2008-06-11 14:52:57 +04:00
|
|
|
{
|
2013-03-09 01:53:04 +04:00
|
|
|
// parse
|
|
|
|
vector<StringPiece> annotatedWordVector;
|
|
|
|
for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
|
|
|
|
annotatedWordVector.push_back(*it);
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2013-03-09 01:53:04 +04:00
|
|
|
if (annotatedWordVector.size() == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
|
|
|
|
// to
|
|
|
|
// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
|
|
|
|
|
|
|
|
size_t numWords;
|
|
|
|
const StringPiece &annotatedWord = annotatedWordVector.back();
|
|
|
|
if (annotatedWord.size() >= 2
|
|
|
|
&& *annotatedWord.data() == '['
|
|
|
|
&& annotatedWord.data()[annotatedWord.size() - 1] == ']') {
|
|
|
|
// hiero/syntax rule
|
|
|
|
numWords = annotatedWordVector.size()-1;
|
|
|
|
|
|
|
|
// lhs
|
|
|
|
CHECK(lhs);
|
|
|
|
lhs->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
|
|
|
|
assert(lhs->IsNonTerminal());
|
|
|
|
}
|
|
|
|
else {
|
2013-04-24 22:23:14 +04:00
|
|
|
//CHECK(lhs == NULL);
|
2013-03-09 01:53:04 +04:00
|
|
|
|
|
|
|
numWords = annotatedWordVector.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
// parse each word
|
|
|
|
m_words.reserve(numWords);
|
|
|
|
|
|
|
|
for (size_t phrasePos = 0 ; phrasePos < numWords; phrasePos++) {
|
|
|
|
StringPiece &annotatedWord = annotatedWordVector[phrasePos];
|
|
|
|
bool isNonTerminal;
|
|
|
|
if (annotatedWord.size() >= 2 && *annotatedWord.data() == '[' && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
|
|
|
|
// non-term
|
|
|
|
isNonTerminal = true;
|
|
|
|
|
|
|
|
size_t nextPos = annotatedWord.find('[', 1);
|
|
|
|
CHECK(nextPos != string::npos);
|
|
|
|
|
|
|
|
if (direction == Input)
|
|
|
|
annotatedWord = annotatedWord.substr(1, nextPos - 2);
|
|
|
|
else
|
|
|
|
annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
|
|
|
|
} else {
|
|
|
|
isNonTerminal = false;
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
2013-03-09 01:53:04 +04:00
|
|
|
|
|
|
|
Word &word = AddWord();
|
|
|
|
word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
int Phrase::Compare(const Phrase &other) const
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
#ifdef min
|
|
|
|
#undef min
|
|
|
|
#endif
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t thisSize = GetSize()
|
|
|
|
,compareSize = other.GetSize();
|
|
|
|
if (thisSize != compareSize) {
|
|
|
|
return (thisSize < compareSize) ? -1 : 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t pos = 0 ; pos < thisSize ; pos++) {
|
|
|
|
const Word &thisWord = GetWord(pos)
|
|
|
|
,&otherWord = other.GetWord(pos);
|
|
|
|
int ret = Word::Compare(thisWord, otherWord);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2010-02-03 13:23:32 +03:00
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
bool Phrase::Contains(const vector< vector<string> > &subPhraseVector
|
2011-02-24 16:14:42 +03:00
|
|
|
, const vector<FactorType> &inputFactor) const
|
2008-06-11 14:52:57 +04:00
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
const size_t subSize = subPhraseVector.size()
|
|
|
|
,thisSize= GetSize();
|
|
|
|
if (subSize > thisSize)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// try to match word-for-word
|
|
|
|
for (size_t currStartPos = 0 ; currStartPos < (thisSize - subSize + 1) ; currStartPos++) {
|
|
|
|
bool match = true;
|
|
|
|
|
|
|
|
for (size_t currFactorIndex = 0 ; currFactorIndex < inputFactor.size() ; currFactorIndex++) {
|
|
|
|
FactorType factorType = inputFactor[currFactorIndex];
|
|
|
|
for (size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos++) {
|
|
|
|
size_t currThisPos = currSubPos + currStartPos;
|
2013-04-25 22:42:30 +04:00
|
|
|
const string &subStr = subPhraseVector[currSubPos][currFactorIndex];
|
2013-04-29 21:46:48 +04:00
|
|
|
StringPiece thisStr = GetFactor(currThisPos, factorType)->GetString();
|
2011-02-24 16:14:42 +03:00
|
|
|
if (subStr != thisStr) {
|
|
|
|
match = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!match)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (match)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Phrase::IsCompatible(const Phrase &inputPhrase) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (inputPhrase.GetSize() != GetSize()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
const size_t size = GetSize();
|
|
|
|
|
2011-11-21 14:49:26 +04:00
|
|
|
const size_t maxNumFactors = MAX_NUM_FACTORS;
|
2011-02-24 16:14:42 +03:00
|
|
|
for (size_t currPos = 0 ; currPos < size ; currPos++) {
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
|
|
|
|
FactorType factorType = static_cast<FactorType>(currFactor);
|
|
|
|
const Factor *thisFactor = GetFactor(currPos, factorType)
|
|
|
|
,*inputFactor = inputPhrase.GetFactor(currPos, factorType);
|
|
|
|
if (thisFactor != NULL && inputFactor != NULL && thisFactor != inputFactor)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Phrase::IsCompatible(const Phrase &inputPhrase, FactorType factorType) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (inputPhrase.GetSize() != GetSize()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
|
|
|
|
if (GetFactor(currPos, factorType) != inputPhrase.GetFactor(currPos, factorType))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Phrase::IsCompatible(const Phrase &inputPhrase, const std::vector<FactorType>& factorVec) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (inputPhrase.GetSize() != GetSize()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
|
|
|
|
for (std::vector<FactorType>::const_iterator i = factorVec.begin();
|
|
|
|
i != factorVec.end(); ++i) {
|
|
|
|
if (GetFactor(currPos, *i) != inputPhrase.GetFactor(currPos, *i))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
size_t Phrase::GetNumTerminals() const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t ret = 0;
|
|
|
|
|
|
|
|
for (size_t pos = 0; pos < GetSize(); ++pos) {
|
|
|
|
if (!GetWord(pos).IsNonTerminal())
|
|
|
|
ret++;
|
|
|
|
}
|
|
|
|
return ret;
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// Currently a no-op: the body is empty.
void Phrase::InitializeMemPool() {}
|
|
|
|
|
|
|
|
// Currently a no-op: the body is empty.
void Phrase::FinalizeMemPool() {}
|
|
|
|
|
2013-04-26 15:20:49 +04:00
|
|
|
void Phrase::OnlyTheseFactors(const FactorMask &factors)
|
|
|
|
{
|
|
|
|
for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
|
|
|
|
if (!factors[currFactor]) {
|
|
|
|
for (size_t pos = 0; pos < GetSize(); ++pos) {
|
|
|
|
SetFactor(pos, currFactor, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// NOTE(review): TO_STRING_BODY is a macro defined outside this file;
// presumably it emits a string-conversion method for Phrase built on the
// operator<< below -- confirm against the macro definition.
TO_STRING_BODY(Phrase);
|
|
|
|
|
|
|
|
// friend
|
|
|
|
// Stream every word of the phrase, in order, to `out` using the Word
// stream operator; nothing else is written by this function itself.
ostream& operator<<(ostream& out, const Phrase& phrase)
{
  //	out << "(size " << phrase.GetSize() << ") ";
  const size_t numWords = phrase.GetSize();
  for (size_t pos = 0; pos < numWords; ++pos) {
    out << phrase.GetWord(pos);
  }
  return out;
}
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|