mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
251 lines
8.2 KiB
C++
251 lines
8.2 KiB
C++
/***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) University of Edinburgh

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
|
|
|
|
#include "PropertiesConsolidator.h"
|
|
|
|
#include <sstream>
|
|
#include <limits>
|
|
#include <vector>
|
|
|
|
#include "moses/Util.h"
|
|
#include "phrase-extract/InputFileStream.h"
|
|
#include "phrase-extract/OutputFileStream.h"
|
|
|
|
|
|
namespace MosesTraining
|
|
{
|
|
|
|
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
|
|
{
|
|
Moses::InputFileStream inFile(sourceLabelSetFile);
|
|
|
|
// read source label set
|
|
m_sourceLabels.clear();
|
|
std::string line;
|
|
while (getline(inFile, line)) {
|
|
std::istringstream tokenizer(line);
|
|
std::string label;
|
|
size_t index;
|
|
try {
|
|
tokenizer >> label >> index;
|
|
} catch (const std::exception &e) {
|
|
UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
|
|
}
|
|
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
|
|
UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
|
|
}
|
|
|
|
inFile.Close();
|
|
|
|
m_sourceLabelsFlag = true;
|
|
}
|
|
|
|
|
|
void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
|
|
{
|
|
Moses::InputFileStream inFile(partsOfSpeechFile);
|
|
|
|
// read parts-of-speech vocabulary
|
|
m_partsOfSpeechVocabulary.clear();
|
|
std::string line;
|
|
while (getline(inFile, line)) {
|
|
std::istringstream tokenizer(line);
|
|
std::string label;
|
|
size_t index;
|
|
try {
|
|
tokenizer >> label >> index;
|
|
} catch (const std::exception &e) {
|
|
UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
|
|
}
|
|
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
|
|
UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
|
|
}
|
|
|
|
inFile.Close();
|
|
|
|
m_partsOfSpeechFlag = true;
|
|
}
|
|
|
|
|
|
void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
|
|
{
|
|
if ( propertiesString.empty() ) {
|
|
return;
|
|
}
|
|
|
|
std::vector<std::string> toks;
|
|
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
|
for (size_t i = 1; i < toks.size(); ++i) {
|
|
std::string &tok = toks[i];
|
|
if (tok.empty()) {
|
|
continue;
|
|
}
|
|
size_t endPos = tok.rfind("}");
|
|
tok = tok.substr(0, endPos - 1);
|
|
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
|
assert(keyValue.size() == 2);
|
|
|
|
if ( !keyValue[0].compare("SourceLabels") ) {
|
|
|
|
if ( m_sourceLabelsFlag ) {
|
|
|
|
// SourceLabels property: replace strings with vocabulary indices
|
|
out << " {{" << keyValue[0];
|
|
ProcessSourceLabelsPropertyValue(keyValue[1], out);
|
|
out << "}}";
|
|
|
|
} else { // don't process SourceLabels property
|
|
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
|
}
|
|
|
|
} else if ( !keyValue[0].compare("POS") ) {
|
|
|
|
/* DO NOTHING (property is not registered in the decoder at the moment)
|
|
if ( m_partsOfSpeechFlag ) {
|
|
|
|
// POS property: replace strings with vocabulary indices
|
|
out << " {{" << keyValue[0];
|
|
ProcessPOSPropertyValue(keyValue[1], out);
|
|
out << "}}";
|
|
|
|
} else { // don't process POS property
|
|
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
|
}
|
|
*/
|
|
|
|
} else {
|
|
|
|
// output other property
|
|
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void PropertiesConsolidator::ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
|
{
|
|
// SourceLabels property: replace strings with vocabulary indices
|
|
std::istringstream tokenizer(value);
|
|
|
|
size_t nNTs;
|
|
double totalCount;
|
|
|
|
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
|
UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
|
|
<< "Flawed SourceLabels property?");
|
|
}
|
|
assert( nNTs > 0 );
|
|
out << " " << nNTs;
|
|
|
|
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
|
UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
|
|
<< "Flawed SourceLabels property?");
|
|
}
|
|
assert( totalCount > 0.0 );
|
|
out << " " << totalCount;
|
|
|
|
while (tokenizer.peek() != EOF) {
|
|
try {
|
|
|
|
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
|
|
|
std::string token;
|
|
|
|
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
|
for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
|
|
tokenizer >> token; // RHS source non-terminal label
|
|
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
|
UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
|
out << " " << found->second;
|
|
}
|
|
|
|
tokenizer >> token; // sourceLabelsRHSCount
|
|
out << " " << token;
|
|
|
|
tokenizer >> numberOfLHSsGivenRHS;
|
|
out << " " << numberOfLHSsGivenRHS;
|
|
}
|
|
|
|
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
|
tokenizer >> token; // LHS source non-terminal label
|
|
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
|
UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
|
out << " " << found->second;
|
|
|
|
tokenizer >> token; // ruleSourceLabelledCount
|
|
out << " " << token;
|
|
}
|
|
|
|
} catch (const std::exception &e) {
|
|
UTIL_THROW2("Flawed item in SourceLabels property?");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void PropertiesConsolidator::ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
|
{
|
|
std::istringstream tokenizer(value);
|
|
while (tokenizer.peek() != EOF) {
|
|
std::string token;
|
|
tokenizer >> token;
|
|
std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
|
|
UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
|
|
out << " " << found->second;
|
|
}
|
|
}
|
|
|
|
|
|
bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const
|
|
{
|
|
out.clear();
|
|
if ( propertiesString.empty() ) {
|
|
return false;
|
|
}
|
|
|
|
std::vector<std::string> toks;
|
|
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
|
for (size_t i = 1; i < toks.size(); ++i) {
|
|
std::string &tok = toks[i];
|
|
if (tok.empty()) {
|
|
continue;
|
|
}
|
|
size_t endPos = tok.rfind("}");
|
|
tok = tok.substr(0, endPos - 1);
|
|
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
|
assert(keyValue.size() == 2);
|
|
|
|
if ( !keyValue[0].compare("POS") ) {
|
|
std::istringstream tokenizer(keyValue[1]);
|
|
while (tokenizer.peek() != EOF) {
|
|
std::string token;
|
|
tokenizer >> token;
|
|
out.push_back(token);
|
|
}
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
} // namespace MosesTraining
|
|
|