2011-10-17 15:37:23 +04:00
|
|
|
// $Id: PhraseDictionaryMemory.cpp 4365 2011-10-14 16:40:30Z heafield $
|
2008-06-11 14:52:57 +04:00
|
|
|
// vim:tabstop=2
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
#include <fstream>
|
|
|
|
#include <string>
|
|
|
|
#include <iterator>
|
|
|
|
#include <algorithm>
|
2011-10-17 15:37:23 +04:00
|
|
|
#include <memory>
|
2008-06-11 14:52:57 +04:00
|
|
|
#include <sys/stat.h>
|
2011-10-14 20:40:30 +04:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include "util/file_piece.hh"
|
|
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "PhraseDictionaryMemory.h"
|
|
|
|
#include "FactorCollection.h"
|
|
|
|
#include "Word.h"
|
|
|
|
#include "Util.h"
|
|
|
|
#include "InputFileStream.h"
|
|
|
|
#include "StaticData.h"
|
|
|
|
#include "WordsRange.h"
|
|
|
|
#include "UserMessage.h"
|
2011-09-20 19:32:26 +04:00
|
|
|
#include "SparsePhraseDictionaryFeature.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2011-10-14 20:40:30 +04:00
|
|
|
|
|
|
|
namespace {
|
|
|
|
void ParserDeath(const std::string &file, size_t line_num) {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "Syntax error at " << file << ":" << line_num;
|
|
|
|
UserMessage::Add(strme.str());
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
// Returns the element the iterator currently points at and advances the
// iterator by one. If the iterator tests false (no element available),
// ParserDeath is called with the given file and line number, which aborts.
template <class It> StringPiece GrabOrDie(It &it, const std::string &file, size_t line_num) {
  const bool exhausted = !it;
  if (exhausted) {
    ParserDeath(file, line_num);
  }
  // Copy the current piece first, then step past it.
  StringPiece piece(*it);
  ++it;
  return piece;
}
|
|
|
|
} // namespace
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
|
2011-02-24 16:14:42 +03:00
|
|
|
, const std::vector<FactorType> &output
|
|
|
|
, const string &filePath
|
|
|
|
, const vector<float> &weight
|
|
|
|
, size_t tableLimit
|
|
|
|
, const LMList &languageModels
|
|
|
|
, float weightWP)
|
2008-06-11 14:52:57 +04:00
|
|
|
{
|
2012-07-02 18:57:54 +04:00
|
|
|
const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
const StaticData &staticData = StaticData::Instance();
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
m_tableLimit = tableLimit;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
size_t line_num = 0;
|
|
|
|
size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
|
2011-10-14 20:40:30 +04:00
|
|
|
const std::string& factorDelimiter = staticData.GetFactorDelimiter();
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-11-21 14:49:26 +04:00
|
|
|
Phrase sourcePhrase(0);
|
2011-10-17 15:37:23 +04:00
|
|
|
std::vector<float> scv;
|
|
|
|
scv.reserve(m_numScoreComponent);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-10-17 15:37:23 +04:00
|
|
|
TargetPhraseCollection *preSourceNode = NULL;
|
|
|
|
std::string preSourceString;
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
while(true) {
|
2011-02-24 16:14:42 +03:00
|
|
|
++line_num;
|
2011-10-14 20:40:30 +04:00
|
|
|
StringPiece line;
|
|
|
|
try {
|
|
|
|
line = inFile.ReadLine();
|
|
|
|
} catch (util::EndOfFileException &e) {
|
|
|
|
break;
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
|
|
|
|
StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
|
|
|
|
StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
|
|
|
|
StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
|
2011-02-24 16:14:42 +03:00
|
|
|
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
|
2011-10-14 20:40:30 +04:00
|
|
|
TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
|
2011-02-24 16:14:42 +03:00
|
|
|
continue;
|
|
|
|
}
|
2011-10-14 20:40:30 +04:00
|
|
|
|
|
|
|
//target
|
2012-10-22 20:40:23 +04:00
|
|
|
std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
|
2011-10-17 15:37:23 +04:00
|
|
|
targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);
|
2011-10-14 20:40:30 +04:00
|
|
|
|
2011-10-17 15:37:23 +04:00
|
|
|
scv.clear();
|
2011-10-14 20:40:30 +04:00
|
|
|
for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
|
|
|
|
char *err_ind;
|
|
|
|
// Token is always delimited by some form of space. Also, apparently strtod is portable but strtof isn't.
|
|
|
|
scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
|
|
|
|
if (err_ind == token->data()) {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "Bad number " << token << " on line " << line_num;
|
|
|
|
UserMessage::Add(strme.str());
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (scv.size() != m_numScoreComponent) {
|
2011-02-24 16:14:42 +03:00
|
|
|
stringstream strme;
|
2011-10-14 20:40:30 +04:00
|
|
|
strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
|
2011-02-24 16:14:42 +03:00
|
|
|
UserMessage::Add(strme.str());
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-10-28 18:54:23 +04:00
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
size_t consumed = 3;
|
|
|
|
if (pipes) {
|
2011-10-17 15:37:23 +04:00
|
|
|
targetPhrase->SetAlignmentInfo(*pipes++);
|
2011-10-14 20:40:30 +04:00
|
|
|
++consumed;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-09-20 19:32:26 +04:00
|
|
|
ScoreComponentCollection sparse;
|
2011-10-28 18:54:23 +04:00
|
|
|
if (pipes) pipes++; //counts
|
|
|
|
if (pipes) {
|
2011-09-20 19:32:26 +04:00
|
|
|
//sparse features
|
|
|
|
SparsePhraseDictionaryFeature* spdf =
|
|
|
|
GetFeature()->GetSparsePhraseDictionaryFeature();
|
|
|
|
if (spdf) {
|
2011-10-28 18:54:23 +04:00
|
|
|
sparse.Assign(spdf,(pipes++)->as_string());
|
2011-09-20 19:32:26 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2011-10-28 18:54:23 +04:00
|
|
|
// scv good to go sir!
|
|
|
|
targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);
|
|
|
|
|
2011-10-14 20:40:30 +04:00
|
|
|
// Check number of entries delimited by ||| agrees across all lines.
|
|
|
|
for (; pipes; ++pipes, ++consumed) {}
|
|
|
|
if (numElement != consumed) {
|
|
|
|
if (numElement == NOT_FOUND) {
|
|
|
|
numElement = consumed;
|
|
|
|
} else {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "Syntax error at " << filePath << ":" << line_num;
|
|
|
|
UserMessage::Add(strme.str());
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2012-09-26 17:40:06 +04:00
|
|
|
//TODO: Would be better to reuse source phrases, but ownership has to be
|
|
|
|
//consistent across phrase table implementations
|
|
|
|
sourcePhrase.Clear();
|
|
|
|
sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
|
|
|
|
//Now that the source phrase is ready, we give the target phrase a copy
|
|
|
|
targetPhrase->SetSourcePhrase(sourcePhrase);
|
2011-10-17 15:37:23 +04:00
|
|
|
if (preSourceString == sourcePhraseString && preSourceNode) {
|
|
|
|
preSourceNode->Add(targetPhrase.release());
|
|
|
|
} else {
|
|
|
|
preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
|
|
|
|
preSourceNode->Add(targetPhrase.release());
|
|
|
|
preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// sort each target phrase collection
|
|
|
|
m_collection.Sort(m_tableLimit);
|
|
|
|
|
2012-07-19 19:41:41 +04:00
|
|
|
/* // TODO ASK OLIVER WHY THIS IS NEEDED
|
2012-07-02 18:57:54 +04:00
|
|
|
const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
|
2012-07-19 19:41:41 +04:00
|
|
|
*/
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Walks m_collection word by word along the source phrase, calling
 * GetOrCreateChild at each step, and then calls
 * CreateTargetPhraseCollection() on the node that is reached.
 *
 * @param source source phrase whose words select the path
 * @return the result of CreateTargetPhraseCollection() on the final node,
 *         or NULL if GetOrCreateChild returns NULL at any step
 */
TargetPhraseCollection *PhraseDictionaryMemory::CreateTargetPhraseCollection(const Phrase &source)
{
  PhraseDictionaryNode *node = &m_collection;
  const size_t numWords = source.GetSize();

  size_t idx = 0;
  while (idx < numWords) {
    node = node->GetOrCreateChild(source.GetWord(idx));
    if (node == NULL) {
      return NULL;
    }
    ++idx;
  }

  return node->CreateTargetPhraseCollection();
}
|
|
|
|
|
|
|
|
/**
 * Read-only counterpart of CreateTargetPhraseCollection: follows the source
 * phrase word by word through m_collection using GetChild, so no node is
 * created.
 *
 * @param source source phrase whose words select the path
 * @return the result of GetTargetPhraseCollection() on the final node,
 *         or NULL if GetChild returns NULL at any step
 */
const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(const Phrase &source) const
{
  const PhraseDictionaryNode *node = &m_collection;
  const size_t numWords = source.GetSize();

  size_t idx = 0;
  while (idx < numWords) {
    node = node->GetChild(source.GetWord(idx));
    if (node == NULL) {
      return NULL;
    }
    ++idx;
  }

  return node->GetTargetPhraseCollection();
}
|
|
|
|
|
|
|
|
// Destructor. Intentionally empty: no explicit cleanup is performed here.
PhraseDictionaryMemory::~PhraseDictionaryMemory() {}
|
|
|
|
|
|
|
|
// NOTE(review): TO_STRING_BODY is a macro defined outside this file; presumably it
// expands to the ToString() definition for this class — confirm against the macro.
TO_STRING_BODY(PhraseDictionaryMemory);
|
|
|
|
|
|
|
|
// friend
|
|
|
|
/**
 * Streams a PhraseDictionaryMemory: iterates over phraseDict.m_collection
 * and writes the Word key (`first`) of each entry to the stream, with no
 * separator between them.
 *
 * @param out        destination stream
 * @param phraseDict dictionary to print
 * @return the same stream, to allow chaining
 */
ostream& operator<<(ostream& out, const PhraseDictionaryMemory& phraseDict)
{
  const PhraseDictionaryNode &root = phraseDict.m_collection;

  PhraseDictionaryNode::const_iterator entry = root.begin();
  while (entry != root.end()) {
    out << (*entry).first;
    ++entry;
  }

  return out;
}
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|