Fixed queryPhraseTableMin, added warnings for compacting phrase tables without alignment

This commit is contained in:
Marcin Junczys-Dowmunt 2013-07-08 09:58:02 +02:00
parent 69b7bd3336
commit d3b4c11be2
5 changed files with 36 additions and 22 deletions

View File

@ -51,8 +51,8 @@ int main(int argc, char **argv)
const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
StaticData::InstanceNonConst().LoadData(parameter);

View File

@ -190,7 +190,7 @@ std::string PhraseDecoder::MakeSourceKey(std::string &source)
return source + m_separator;
}
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel, bool eval)
{
// Not using TargetPhraseCollection avoiding "new" operator
@ -234,7 +234,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
// Decompress and decode target phrase collection
TargetPhraseVectorPtr decodedPhraseColl =
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel);
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel, eval);
return decodedPhraseColl;
} else
@ -243,7 +243,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
const Phrase &sourcePhrase, bool topLevel)
const Phrase &sourcePhrase, bool topLevel, bool eval)
{
bool extending = tpv->size();
@ -397,7 +397,8 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
if(scores.size() == m_numScoreComponent) {
targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores);
targetPhrase->Evaluate(sourcePhrase);
if(eval)
targetPhrase->Evaluate(sourcePhrase);
if(m_containsAlignmentInfo)
state = Alignment;

View File

@ -131,12 +131,13 @@ public:
size_t Load(std::FILE* in);
TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
bool topLevel = false);
bool topLevel = false, bool eval = true);
TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
BitWrapper<> &encodedBitStream,
const Phrase &sourcePhrase,
bool topLevel);
bool topLevel,
bool eval);
void PruneCache();
};

View File

@ -117,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
// Retrieve target phrase collection from phrase table
TargetPhraseVectorPtr decodedPhraseColl
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
@ -151,7 +151,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase
return TargetPhraseVectorPtr();
// Retrieve target phrase collection from phrase table
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
}
PhraseDictionaryCompact::~PhraseDictionaryCompact()

View File

@ -38,7 +38,7 @@ bool operator<(const PackedItem &pi1, const PackedItem &pi2)
}
std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
std::string PhraseTableCreator::m_separator = " ||| ";
std::string PhraseTableCreator::m_separator = "|||";
PhraseTableCreator::PhraseTableCreator(std::string inPath,
std::string outPath,
@ -332,12 +332,12 @@ void PhraseTableCreator::CreateRankHash()
inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
{
return source + m_separator;
return source + " " + m_separator + " ";
}
inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
{
return source + m_separator + target + m_separator;
return source + " " + m_separator + " " + target + " " + m_separator + " ";
}
void PhraseTableCreator::EncodeTargetPhrases()
@ -1034,17 +1034,24 @@ void RankingTask::operator()()
for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
if(tokens.size() < 3) {
for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
*it = Moses::Trim(*it);
if(tokens.size() < 4) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
}
if(tokens.size() == 3 && m_creator.m_warnMe) {
std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl;
std::cerr << "but you are using PREnc encoding which makes use of alignment data. " << std::endl;
std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl;
if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl;
std::cerr << "but you are using ";
std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
std::cerr << " encoding which makes use of alignment data. " << std::endl;
std::cerr << "Use -encoding None" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
}
std::vector<float> scores = Tokenize<float>(tokens[2]);
@ -1125,18 +1132,23 @@ void EncodingTask::operator()()
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
*it = Moses::Trim(*it);
if(tokens.size() < 3) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
}
if(tokens.size() == 3 && m_creator.m_coding != PhraseTableCreator::None && m_creator.m_warnMe) {
std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl;
if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl;
std::cerr << "but you are using ";
std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
std::cerr << " encoding which makes use of alignment data. " << std::endl;
std::cerr << "Better use -encoding None or disable this warning with -no-warnings." << std::endl;
std::cerr << "Use -encoding None" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
}
size_t ownRank = 0;