2010-10-12 20:17:41 +04:00
|
|
|
// $Id$
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based, hierarchical and syntactic language decoder
|
|
|
|
Copyright (C) 2009 Hieu Hoang
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-10-12 20:17:41 +04:00
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-10-12 20:17:41 +04:00
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-10-12 20:17:41 +04:00
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
2010-04-08 21:16:10 +04:00
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <iostream>
|
2012-11-12 23:56:18 +04:00
|
|
|
#include "moses/Util.h"
|
|
|
|
#include "moses/TargetPhraseCollection.h"
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/TranslationModel/PhraseDictionary.h"
|
2010-04-08 21:16:10 +04:00
|
|
|
#include "TargetPhraseCollection.h"
|
|
|
|
#include "Vocab.h"
|
|
|
|
#include "OnDiskWrapper.h"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace OnDiskPt
|
|
|
|
{
|
|
|
|
|
|
|
|
// Out-of-class definition of the static member s_sortScoreInd. With no
// initializer it is zero-initialised (static storage duration).
// NOTE(review): presumably the index of the score used for ordering phrases -
// confirm against TargetPhraseOrderByScore; nothing in this file reads or writes it.
size_t TargetPhraseCollection::s_sortScoreInd;
|
|
|
|
|
|
|
|
/** Construct an empty collection.
 *
 * m_filePos starts at the literal 777; Save() overwrites it with the offset
 * at which the collection was written.
 * NOTE(review): 777 looks like an arbitrary "not saved yet" marker - confirm.
 */
TargetPhraseCollection::TargetPhraseCollection()
  : m_filePos(777)
{
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
/** Copy constructor.
 *
 * Copies only the bookkeeping members (file position and debug string).
 * m_coll is not mentioned in the initializer list, so the new object starts
 * with a default-constructed (empty) phrase list rather than sharing the
 * source's TargetPhrase pointers.
 * NOTE(review): presumably deliberate, since the destructor releases the
 * pointers held in m_coll - confirm with callers that rely on copies.
 *
 * @param copy collection whose file position and debug string are duplicated
 */
TargetPhraseCollection::TargetPhraseCollection(const TargetPhraseCollection &copy)
  : m_filePos(copy.m_filePos)
  , m_debugStr(copy.m_debugStr)
{
}
|
|
|
|
|
|
|
|
/** Destructor: hands the phrase list to Moses::RemoveAllInColl.
 * NOTE(review): presumably that helper deletes every pointed-to TargetPhrase
 * and empties the container - confirm in moses/Util.h.
 */
TargetPhraseCollection::~TargetPhraseCollection()
{
  Moses::RemoveAllInColl(m_coll);
}
|
|
|
|
|
|
|
|
/** Append a phrase to the end of the collection.
 *
 * The pointer itself is stored (no copy is made). Sort() and the destructor
 * later release stored pointers, so the caller must not delete it.
 *
 * @param targetPhrase phrase pointer to store
 */
void TargetPhraseCollection::AddTargetPhrase(TargetPhrase *targetPhrase)
{
  m_coll.push_back(targetPhrase);
}
|
|
|
|
|
|
|
|
void TargetPhraseCollection::Sort(size_t tableLimit)
|
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
std::sort(m_coll.begin(), m_coll.end(), TargetPhraseOrderByScore());
|
|
|
|
|
|
|
|
if (m_coll.size() > tableLimit) {
|
|
|
|
CollType::iterator iter;
|
|
|
|
for (iter = m_coll.begin() + tableLimit ; iter != m_coll.end(); ++iter) {
|
|
|
|
delete *iter;
|
|
|
|
}
|
|
|
|
m_coll.resize(tableLimit);
|
|
|
|
}
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
/** Serialise the collection and append it to the target-collection file.
 *
 * Record layout, as built below:
 *   [UINT64 number of phrases][other-info blob of phrase 0][blob of phrase 1]...
 * Each phrase is first saved through TargetPhrase::Save(), then its
 * "other info" blob (from WriteOtherInfoToMemory) is appended to the record.
 * On return m_filePos holds the offset at which the record starts.
 *
 * The record is staged in a std::vector<char> instead of a malloc/realloc
 * buffer, so an allocation failure raises std::bad_alloc rather than writing
 * through a null pointer.
 *
 * @param onDiskWrapper provides the output stream (GetFileTargetColl) and is
 *                      passed on to the per-phrase save routines
 */
void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileTargetColl();

  std::vector<char> buffer;

  // Header: number of phrases in the collection.
  const UINT64 numPhrases = GetSize();
  const char *countBytes = reinterpret_cast<const char*>(&numPhrases);
  buffer.insert(buffer.end(), countBytes, countBytes + sizeof(UINT64));

  // MAIN LOOP
  for (CollType::iterator iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
    // save phrase
    TargetPhrase &targetPhrase = **iter;
    targetPhrase.Save(onDiskWrapper);

    // save coll: append this phrase's other-info blob to the record
    size_t memUsedTPOtherInfo = 0;
    char *memTPOtherInfo = targetPhrase.WriteOtherInfoToMemory(onDiskWrapper, memUsedTPOtherInfo);

    buffer.insert(buffer.end(), memTPOtherInfo, memTPOtherInfo + memUsedTPOtherInfo);

    // The blob is released with free(), exactly as before.
    free(memTPOtherInfo);
  }

  // Move to the end of the file *before* sampling the start offset, so that
  // m_filePos is the position the record is really written at even if the
  // stream was left somewhere else.
  file.seekp(0, std::ios::end);
  const UINT64 startPos = file.tellp();

  file.write(&buffer[0], buffer.size());

  const UINT64 endPos = file.tellp();
  CHECK(startPos + buffer.size() == endPos);

  m_filePos = startPos;
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
/** Build a Moses::TargetPhraseCollection from this on-disk collection.
 *
 * Every stored phrase is converted with TargetPhrase::ConvertToMoses and added
 * to a newly allocated Moses collection, which is then sorted with
 * Sort(true, phraseDict.GetTableLimit()) before being returned.
 * The unnamed std::string parameter (filePath) is not used.
 *
 * @return the heap-allocated collection, handed back as a raw pointer
 *         (NOTE(review): presumably the caller takes ownership - confirm)
 */
Moses::TargetPhraseCollection *TargetPhraseCollection::ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
    , const std::vector<Moses::FactorType> &outputFactors
    , const Moses::PhraseDictionary &phraseDict
    , const std::vector<float> &weightT
    , const Moses::WordPenaltyProducer* wpProducer
    , const Moses::LMList &lmList
    , const std::string & /* filePath */
    , Vocab &vocab) const
{
  Moses::TargetPhraseCollection *converted = new Moses::TargetPhraseCollection();

  const size_t total = m_coll.size();
  for (size_t pos = 0; pos < total; ++pos) {
    const TargetPhrase &onDiskPhrase = *m_coll[pos];

    Moses::TargetPhrase *mosesPhrase =
      onDiskPhrase.ConvertToMoses(inputFactors, outputFactors,
                                  vocab, phraseDict, weightT,
                                  wpProducer, lmList);

    converted->Add(mosesPhrase);
  }

  converted->Sort(true, phraseDict.GetTableLimit());

  return converted;
}
|
|
|
|
|
|
|
|
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper)
|
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
|
|
|
|
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
|
2012-05-18 22:09:04 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
size_t numScores = onDiskWrapper.GetNumScores();
|
2012-05-18 22:09:04 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
|
|
|
|
UINT64 numPhrases;
|
|
|
|
|
|
|
|
UINT64 currFilePos = filePos;
|
|
|
|
fileTPColl.seekg(filePos);
|
|
|
|
fileTPColl.read((char*) &numPhrases, sizeof(UINT64));
|
|
|
|
|
|
|
|
// table limit
|
|
|
|
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
|
|
|
|
|
|
|
|
currFilePos += sizeof(UINT64);
|
2012-05-18 22:09:04 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
for (size_t ind = 0; ind < numPhrases; ++ind) {
|
2012-05-18 22:09:04 +04:00
|
|
|
TargetPhrase *tp = new TargetPhrase(numScores);
|
2011-02-24 16:57:11 +03:00
|
|
|
|
|
|
|
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
|
2012-06-16 00:44:45 +04:00
|
|
|
tp->ReadFromFile(fileTP);
|
2011-02-24 16:57:11 +03:00
|
|
|
|
|
|
|
currFilePos += sizeOtherInfo;
|
|
|
|
|
|
|
|
m_coll.push_back(tp);
|
|
|
|
}
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** @return the stored file position: the offset recorded by the last Save(),
 *          or the value the object was constructed with if it was never saved.
 */
UINT64 TargetPhraseCollection::GetFilePos() const
{
  return m_filePos;
}
|
2010-04-08 21:16:10 +04:00
|
|
|
|
|
|
|
const std::string TargetPhraseCollection::GetDebugStr() const
|
2011-02-24 16:57:11 +03:00
|
|
|
{
|
|
|
|
return m_debugStr;
|
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
void TargetPhraseCollection::SetDebugStr(const std::string &str)
|
2011-02-24 16:57:11 +03:00
|
|
|
{
|
|
|
|
m_debugStr = str;
|
|
|
|
}
|
2010-04-08 21:16:10 +04:00
|
|
|
|
2012-05-15 19:03:40 +04:00
|
|
|
/** Read-only access to the phrase at a given position.
 *
 * @param ind zero-based position; must be smaller than GetSize()
 *            (enforced by assert only)
 * @return reference to the stored phrase
 */
const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
{
  assert(ind < GetSize());

  const TargetPhrase *phrase = m_coll[ind];
  return *phrase;
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|