2014-05-30 14:27:59 +04:00
|
|
|
#include <fstream>
|
|
|
|
|
2014-06-10 01:17:05 +04:00
|
|
|
#include "moses/FactorCollection.h"
|
|
|
|
#include "moses/InputPath.h"
|
2014-05-30 14:27:59 +04:00
|
|
|
#include "moses/Util.h"
|
2014-06-25 00:50:20 +04:00
|
|
|
|
2014-05-30 14:27:59 +04:00
|
|
|
#include "util/exception.hh"
|
|
|
|
|
2014-06-25 00:50:20 +04:00
|
|
|
#include "util/file_piece.hh"
|
|
|
|
#include "util/string_piece.hh"
|
|
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
|
2014-06-10 01:17:05 +04:00
|
|
|
#include "LexicalReordering.h"
|
2014-05-27 14:05:56 +04:00
|
|
|
#include "SparseReordering.h"
|
|
|
|
|
2014-05-30 14:27:59 +04:00
|
|
|
|
2014-05-27 14:05:56 +04:00
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
2014-07-25 00:23:08 +04:00
|
|
|
const std::string& SparseReorderingFeatureKey::Name (const string& wordListId) {
|
2014-06-13 00:37:18 +04:00
|
|
|
static string kSep = "-";
|
|
|
|
static string name;
|
|
|
|
ostringstream buf;
|
|
|
|
// type side position id word reotype
|
|
|
|
if (type == Phrase) {
|
|
|
|
buf << "phr";
|
|
|
|
} else if (type == Stack) {
|
|
|
|
buf << "stk";
|
|
|
|
} else if (type == Between) {
|
|
|
|
buf << "btn";
|
|
|
|
}
|
|
|
|
buf << kSep;
|
|
|
|
if (side == Source) {
|
|
|
|
buf << "src";
|
|
|
|
} else if (side == Target) {
|
|
|
|
buf << "tgt";
|
|
|
|
}
|
|
|
|
buf << kSep;
|
|
|
|
if (position == First) {
|
|
|
|
buf << "first";
|
|
|
|
} else if (position == Last) {
|
|
|
|
buf << "last";
|
|
|
|
}
|
|
|
|
buf << kSep;
|
|
|
|
buf << wordListId;
|
|
|
|
buf << kSep;
|
2014-06-25 00:50:20 +04:00
|
|
|
if (isCluster) buf << "cluster_";
|
2014-06-13 00:37:18 +04:00
|
|
|
buf << word->GetString();
|
|
|
|
buf << kSep;
|
|
|
|
buf << reoType;
|
|
|
|
name = buf.str();
|
|
|
|
return name;
|
|
|
|
}
|
|
|
|
|
2014-06-10 01:17:05 +04:00
|
|
|
/**
 * Configures sparse reordering from a set of key/value options.
 *
 * Each key is split on '-' and dispatched on its first field:
 *   words-(source|target)-<id>     value is the word list file to load
 *   clusters-(source|target)-<id>  value is the cluster map file to load
 *   phrase | stack | between       switches the corresponding feature type on
 * (The error messages refer to the keys with a "sparse-" prefix; presumably
 * the caller strips that prefix before passing the map in - TODO confirm.)
 *
 * @param config   option name -> option value
 * @param producer feature function used to create the feature names
 * @throws util::Exception if a key cannot be parsed, if a words/clusters key
 *         does not have exactly three fields, or if its second field is
 *         neither "source" nor "target". Loading the files may throw as well.
 */
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
  : m_producer(producer)
{
  // The flags below are only ever switched on by the loop, and CopyScores()
  // reads them unconditionally, so give them a defined "off" state first.
  // (Assigned in the body rather than the initializer list because the
  // member declaration order is not visible from this file.)
  m_usePhrase = false;
  m_useStack = false;
  m_useBetween = false;

  static const string kSource= "source";
  static const string kTarget = "target";
  for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
    vector<string> fields = Tokenize(i->first, "-");
    // Never index fields[0] when tokenisation produced nothing.
    UTIL_THROW_IF(fields.empty(), util::Exception, "Unable to parse sparse reordering option: " << i->first);

    if (fields[0] == "words") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
      } else if (fields[1] == kTarget) {
        ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "clusters") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
      } else if (fields[1] == kTarget) {
        ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "phrase") {
      m_usePhrase = true;
    } else if (fields[0] == "stack") {
      m_useStack = true;
    } else if (fields[0] == "between") {
      m_useBetween = true;
    } else {
      UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
    }
  }
}
|
|
|
|
|
2014-06-25 00:50:20 +04:00
|
|
|
void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster) {
|
|
|
|
for (size_t type = SparseReorderingFeatureKey::Stack;
|
|
|
|
type <= SparseReorderingFeatureKey::Between; ++type) {
|
|
|
|
for (size_t position = SparseReorderingFeatureKey::First;
|
|
|
|
position <= SparseReorderingFeatureKey::Last; ++position) {
|
|
|
|
for (int reoType = 0; reoType <= LexicalReorderingState::MAX; ++reoType) {
|
|
|
|
SparseReorderingFeatureKey key(
|
|
|
|
index, static_cast<SparseReorderingFeatureKey::Type>(type), factor, isCluster,
|
|
|
|
static_cast<SparseReorderingFeatureKey::Position>(position), side, reoType);
|
2014-07-25 00:23:08 +04:00
|
|
|
m_featureMap.insert(pair<SparseReorderingFeatureKey, FName>(key,m_producer->GetFeatureName(key.Name(id))));
|
2014-06-25 00:50:20 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-13 00:37:18 +04:00
|
|
|
void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<WordList>* pWordLists) {
|
2014-05-30 14:27:59 +04:00
|
|
|
ifstream fh(filename.c_str());
|
2014-06-10 01:17:05 +04:00
|
|
|
UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
|
2014-05-30 14:27:59 +04:00
|
|
|
string line;
|
|
|
|
pWordLists->push_back(WordList());
|
|
|
|
pWordLists->back().first = id;
|
|
|
|
while (getline(fh,line)) {
|
2014-06-10 01:17:05 +04:00
|
|
|
//TODO: StringPiece
|
|
|
|
const Factor* factor = FactorCollection::Instance().AddFactor(line);
|
|
|
|
pWordLists->back().second.insert(factor);
|
2014-06-25 00:50:20 +04:00
|
|
|
PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false);
|
2014-06-13 00:37:18 +04:00
|
|
|
|
2014-05-30 14:27:59 +04:00
|
|
|
}
|
|
|
|
}
|
2014-05-27 14:05:56 +04:00
|
|
|
|
2014-06-25 00:50:20 +04:00
|
|
|
void SparseReordering::ReadClusterMap(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<ClusterMap>* pClusterMaps) {
|
|
|
|
pClusterMaps->push_back(ClusterMap());
|
|
|
|
pClusterMaps->back().first = id;
|
|
|
|
util::FilePiece file(filename.c_str());
|
|
|
|
StringPiece line;
|
|
|
|
while (true) {
|
|
|
|
try {
|
|
|
|
line = file.ReadLine();
|
|
|
|
} catch (const util::EndOfFileException &e) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter('\t'));
|
2014-06-30 15:13:33 +04:00
|
|
|
if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing word): '" << line << "'");
|
2014-06-25 00:50:20 +04:00
|
|
|
const Factor* wordFactor = FactorCollection::Instance().AddFactor(*lineIter);
|
|
|
|
++lineIter;
|
2014-06-30 15:13:33 +04:00
|
|
|
if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing cluster id): '" << line << "'");
|
2014-06-25 00:50:20 +04:00
|
|
|
const Factor* idFactor = FactorCollection::Instance().AddFactor(*lineIter);
|
|
|
|
pClusterMaps->back().second[wordFactor] = idFactor;
|
|
|
|
PreCalculateFeatureNames(pClusterMaps->size()-1, id, side, idFactor, true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SparseReordering::AddFeatures(
|
2014-06-13 00:37:18 +04:00
|
|
|
SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
|
|
|
|
const Word& word, SparseReorderingFeatureKey::Position position,
|
2014-06-25 00:50:20 +04:00
|
|
|
LexicalReorderingState::ReorderingType reoType,
|
2014-06-10 01:17:05 +04:00
|
|
|
ScoreComponentCollection* scores) const {
|
|
|
|
|
|
|
|
const Factor* wordFactor = word.GetFactor(0);
|
2014-06-25 00:50:20 +04:00
|
|
|
|
|
|
|
const vector<WordList>* wordLists;
|
|
|
|
const vector<ClusterMap>* clusterMaps;
|
|
|
|
if (side == SparseReorderingFeatureKey::Source) {
|
|
|
|
wordLists = &m_sourceWordLists;
|
|
|
|
clusterMaps = &m_sourceClusterMaps;
|
|
|
|
} else {
|
|
|
|
wordLists = &m_targetWordLists;
|
|
|
|
clusterMaps = &m_targetClusterMaps;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t id = 0; id < wordLists->size(); ++id) {
|
|
|
|
if ((*wordLists)[id].second.find(wordFactor) == (*wordLists)[id].second.end()) continue;
|
|
|
|
SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
|
|
|
|
FeatureMap::const_iterator fmi = m_featureMap.find(key);
|
|
|
|
assert(fmi != m_featureMap.end());
|
2014-07-25 00:23:08 +04:00
|
|
|
scores->SparsePlusEquals(fmi->second, 1.0);
|
2014-06-25 00:50:20 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t id = 0; id < clusterMaps->size(); ++id) {
|
|
|
|
const ClusterMap& clusterMap = (*clusterMaps)[id];
|
|
|
|
boost::unordered_map<const Factor*, const Factor*>::const_iterator clusterIter
|
|
|
|
= clusterMap.second.find(wordFactor);
|
|
|
|
if (clusterIter != clusterMap.second.end()) {
|
|
|
|
SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
|
|
|
|
FeatureMap::const_iterator fmi = m_featureMap.find(key);
|
|
|
|
assert(fmi != m_featureMap.end());
|
2014-07-25 00:23:08 +04:00
|
|
|
scores->SparsePlusEquals(fmi->second, 1.0);
|
2014-06-25 00:50:20 +04:00
|
|
|
}
|
|
|
|
}
|
2014-06-10 01:17:05 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2014-06-07 00:08:09 +04:00
|
|
|
void SparseReordering::CopyScores(
|
2014-06-24 16:37:54 +04:00
|
|
|
const TranslationOption& currentOpt,
|
|
|
|
const TranslationOption* previousOpt,
|
|
|
|
const InputType& input,
|
2014-05-27 14:05:56 +04:00
|
|
|
LexicalReorderingState::ReorderingType reoType,
|
|
|
|
LexicalReorderingConfiguration::Direction direction,
|
|
|
|
ScoreComponentCollection* scores) const
|
|
|
|
{
|
2014-06-24 16:37:54 +04:00
|
|
|
if (m_useBetween && direction == LexicalReorderingConfiguration::Backward &&
|
|
|
|
(reoType == LexicalReorderingState::D || reoType == LexicalReorderingState::DL ||
|
|
|
|
reoType == LexicalReorderingState::DR)) {
|
|
|
|
size_t gapStart, gapEnd;
|
2014-07-24 19:01:54 +04:00
|
|
|
//NB: Using a static cast for speed, but could be nasty if
|
|
|
|
//using non-sentence input
|
|
|
|
const Sentence& sentence = static_cast<const Sentence&>(input);
|
2014-06-24 16:37:54 +04:00
|
|
|
const WordsRange& currentRange = currentOpt.GetSourceWordsRange();
|
|
|
|
if (previousOpt) {
|
|
|
|
const WordsRange& previousRange = previousOpt->GetSourceWordsRange();
|
|
|
|
if (previousRange < currentRange) {
|
|
|
|
gapStart = previousRange.GetEndPos() + 1;
|
|
|
|
gapEnd = currentRange.GetStartPos();
|
|
|
|
} else {
|
|
|
|
gapStart = currentRange.GetEndPos() + 1;
|
|
|
|
gapEnd = previousRange.GetStartPos();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
//start of sentence
|
|
|
|
gapStart = 0;
|
|
|
|
gapEnd = currentRange.GetStartPos();
|
|
|
|
}
|
|
|
|
assert(gapStart < gapEnd);
|
|
|
|
for (size_t i = gapStart; i < gapEnd; ++i) {
|
2014-06-25 00:50:20 +04:00
|
|
|
AddFeatures(SparseReorderingFeatureKey::Between,
|
2014-06-24 16:37:54 +04:00
|
|
|
SparseReorderingFeatureKey::Source, sentence.GetWord(i),
|
2014-06-25 00:50:20 +04:00
|
|
|
SparseReorderingFeatureKey::First, reoType, scores);
|
2014-06-24 16:37:54 +04:00
|
|
|
}
|
|
|
|
}
|
2014-06-07 00:08:09 +04:00
|
|
|
//std::cerr << "SR " << topt << " " << reoType << " " << direction << std::endl;
|
2014-06-13 00:37:18 +04:00
|
|
|
//phrase (backward)
|
|
|
|
//stack (forward)
|
|
|
|
SparseReorderingFeatureKey::Type type;
|
2014-06-10 01:17:05 +04:00
|
|
|
if (direction == LexicalReorderingConfiguration::Forward) {
|
|
|
|
if (!m_useStack) return;
|
2014-06-13 00:37:18 +04:00
|
|
|
type = SparseReorderingFeatureKey::Stack;
|
2014-06-10 13:23:48 +04:00
|
|
|
} else if (direction == LexicalReorderingConfiguration::Backward) {
|
|
|
|
if (!m_usePhrase) return;
|
2014-06-13 00:37:18 +04:00
|
|
|
type = SparseReorderingFeatureKey::Phrase;
|
2014-06-10 13:23:48 +04:00
|
|
|
} else {
|
|
|
|
//Shouldn't be called for bidirectional
|
2014-06-13 00:37:18 +04:00
|
|
|
//keep compiler happy
|
|
|
|
type = SparseReorderingFeatureKey::Phrase;
|
2014-06-10 13:23:48 +04:00
|
|
|
assert(!"Shouldn't call CopyScores() with bidirectional direction");
|
2014-06-10 01:17:05 +04:00
|
|
|
}
|
2014-06-25 00:50:20 +04:00
|
|
|
const Phrase& sourcePhrase = currentOpt.GetInputPath().GetPhrase();
|
|
|
|
AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(0),
|
|
|
|
SparseReorderingFeatureKey::First, reoType, scores);
|
|
|
|
AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(sourcePhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
|
|
|
|
const Phrase& targetPhrase = currentOpt.GetTargetPhrase();
|
|
|
|
AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(0),
|
|
|
|
SparseReorderingFeatureKey::First, reoType, scores);
|
|
|
|
AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(targetPhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
|
2014-06-10 01:17:05 +04:00
|
|
|
|
|
|
|
|
2014-05-27 14:05:56 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
} //namespace
|
|
|
|
|