mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-16 15:00:33 +03:00
beautify
This commit is contained in:
parent
d85bd05e18
commit
2f3cd5e2fe
@ -45,9 +45,9 @@ public:
|
||||
typedef MapType::iterator iterator;
|
||||
|
||||
ChartCellLabelSet(const WordsRange &coverage)
|
||||
: m_coverage(coverage)
|
||||
, m_map(FactorCollection::Instance().GetNumNonTerminals(), NULL)
|
||||
, m_size(0) { }
|
||||
: m_coverage(coverage)
|
||||
, m_map(FactorCollection::Instance().GetNumNonTerminals(), NULL)
|
||||
, m_size(0) { }
|
||||
|
||||
~ChartCellLabelSet() {
|
||||
RemoveAllInColl(m_map);
|
||||
@ -82,8 +82,7 @@ public:
|
||||
if (ChartCellExists(idx)) {
|
||||
ChartCellLabel::Stack & s = m_map[idx]->MutableStack();
|
||||
s.cube = stack;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ChartCellLabel::Stack s;
|
||||
s.cube = stack;
|
||||
m_size++;
|
||||
@ -97,8 +96,7 @@ public:
|
||||
if (m_map.at(idx) != NULL) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
catch (const std::out_of_range& oor) {
|
||||
} catch (const std::out_of_range& oor) {
|
||||
m_map.resize(FactorCollection::Instance().GetNumNonTerminals(), NULL);
|
||||
}
|
||||
return false;
|
||||
@ -116,8 +114,7 @@ public:
|
||||
size_t idx = w[0]->GetId();
|
||||
try {
|
||||
return m_map.at(idx);
|
||||
}
|
||||
catch (const std::out_of_range& oor) {
|
||||
} catch (const std::out_of_range& oor) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
@ -32,8 +32,8 @@ namespace Moses
|
||||
|
||||
// Extract the k-best list from the search graph.
|
||||
void ChartKBestExtractor::Extract(
|
||||
const std::vector<const ChartHypothesis*> &topLevelHypos, std::size_t k,
|
||||
KBestVec &kBestList)
|
||||
const std::vector<const ChartHypothesis*> &topLevelHypos, std::size_t k,
|
||||
KBestVec &kBestList)
|
||||
{
|
||||
kBestList.clear();
|
||||
if (topLevelHypos.empty()) {
|
||||
@ -45,7 +45,7 @@ void ChartKBestExtractor::Extract(
|
||||
std::vector<const ChartHypothesis*>::const_iterator p = topLevelHypos.begin();
|
||||
const ChartHypothesis &bestTopLevelHypo = **p;
|
||||
boost::scoped_ptr<ChartHypothesis> supremeHypo(
|
||||
new ChartHypothesis(bestTopLevelHypo, *this));
|
||||
new ChartHypothesis(bestTopLevelHypo, *this));
|
||||
|
||||
// Do the same for each alternative top-level hypothesis, but add the new
|
||||
// ChartHypothesis objects as arcs from supremeHypo, as if they had been
|
||||
@ -68,8 +68,8 @@ void ChartKBestExtractor::Extract(
|
||||
// each derivation.
|
||||
kBestList.reserve(targetVertex->kBestList.size());
|
||||
for (std::vector<boost::weak_ptr<Derivation> >::const_iterator
|
||||
q = targetVertex->kBestList.begin();
|
||||
q != targetVertex->kBestList.end(); ++q) {
|
||||
q = targetVertex->kBestList.begin();
|
||||
q != targetVertex->kBestList.end(); ++q) {
|
||||
const boost::shared_ptr<Derivation> d(*q);
|
||||
assert(d);
|
||||
assert(d->subderivations.size() == 1);
|
||||
@ -124,7 +124,7 @@ Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d)
|
||||
|
||||
// Create an unweighted hyperarc corresponding to the given ChartHypothesis.
|
||||
ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge(
|
||||
const ChartHypothesis &h)
|
||||
const ChartHypothesis &h)
|
||||
{
|
||||
UnweightedHyperarc edge;
|
||||
edge.head = FindOrCreateVertex(h);
|
||||
|
@ -70,8 +70,8 @@ public:
|
||||
|
||||
struct Vertex {
|
||||
typedef std::priority_queue<boost::weak_ptr<Derivation>,
|
||||
std::vector<boost::weak_ptr<Derivation> >,
|
||||
DerivationOrderer> DerivationQueue;
|
||||
std::vector<boost::weak_ptr<Derivation> >,
|
||||
DerivationOrderer> DerivationQueue;
|
||||
|
||||
Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {}
|
||||
|
||||
@ -92,7 +92,7 @@ public:
|
||||
|
||||
private:
|
||||
typedef boost::unordered_map<const ChartHypothesis *,
|
||||
boost::shared_ptr<Vertex> > VertexMap;
|
||||
boost::shared_ptr<Vertex> > VertexMap;
|
||||
|
||||
struct DerivationHasher {
|
||||
std::size_t operator()(const boost::shared_ptr<Derivation> &d) const {
|
||||
@ -114,7 +114,7 @@ private:
|
||||
};
|
||||
|
||||
typedef boost::unordered_set<boost::shared_ptr<Derivation>, DerivationHasher,
|
||||
DerivationEqualityPred> DerivationSet;
|
||||
DerivationEqualityPred> DerivationSet;
|
||||
|
||||
UnweightedHyperarc CreateEdge(const ChartHypothesis &);
|
||||
boost::shared_ptr<Vertex> FindOrCreateVertex(const ChartHypothesis &);
|
||||
|
@ -269,9 +269,9 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
|
||||
* \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
|
||||
*/
|
||||
void ChartManager::CalcNBest(
|
||||
std::size_t n,
|
||||
std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList,
|
||||
bool onlyDistinct) const
|
||||
std::size_t n,
|
||||
std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList,
|
||||
bool onlyDistinct) const
|
||||
{
|
||||
nBestList.clear();
|
||||
if (n == 0 || m_source.GetSize() == 0) {
|
||||
@ -282,7 +282,7 @@ void ChartManager::CalcNBest(
|
||||
WordsRange range(0, m_source.GetSize()-1);
|
||||
const ChartCell &lastCell = m_hypoStackColl.Get(range);
|
||||
boost::scoped_ptr<const std::vector<const ChartHypothesis*> > topLevelHypos(
|
||||
lastCell.GetAllSortedHypotheses());
|
||||
lastCell.GetAllSortedHypotheses());
|
||||
if (!topLevelHypos) {
|
||||
return;
|
||||
}
|
||||
|
@ -108,7 +108,9 @@ public:
|
||||
return m_hypothesisId++;
|
||||
}
|
||||
|
||||
//! Read-only access to the parser stored in m_parser.
const ChartParser &GetParser() const
{
  return m_parser;
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -183,7 +183,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
|
||||
size_t maxSpan = decodeGraph.GetMaxChartSpan();
|
||||
size_t last = m_source.GetSize()-1;
|
||||
if (maxSpan != 0) {
|
||||
last = min(last, wordsRange.GetStartPos()+maxSpan);
|
||||
last = min(last, wordsRange.GetStartPos()+maxSpan);
|
||||
}
|
||||
if (maxSpan == 0 || wordsRange.GetNumWordsCovered() <= maxSpan) {
|
||||
ruleLookupManager.GetChartRuleCollection(wordsRange, last, to);
|
||||
|
@ -48,7 +48,9 @@ public:
|
||||
|
||||
void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
|
||||
|
||||
//! Read-only view of the m_unksrcs vector of Phrase pointers.
const std::vector<Phrase*> &GetUnknownSources() const
{
  return m_unksrcs;
}
|
||||
|
||||
private:
|
||||
std::vector<Phrase*> m_unksrcs;
|
||||
@ -69,7 +71,9 @@ public:
|
||||
size_t GetSize() const;
|
||||
const InputPath &GetInputPath(size_t startPos, size_t endPos) const;
|
||||
const InputPath &GetInputPath(WordsRange &range) const;
|
||||
//! Forwards to m_unknown.GetUnknownSources() and returns its result unchanged.
const std::vector<Phrase*> &GetUnknownSources() const
{
  return m_unknown.GetUnknownSources();
}
|
||||
|
||||
private:
|
||||
ChartParserUnknown m_unknown;
|
||||
|
@ -161,11 +161,11 @@ void ChartTranslationOptionList::ApplyThreshold()
|
||||
|
||||
float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell) const
|
||||
{
|
||||
const HypoList *stack = chartCell->GetStack().cube;
|
||||
assert(stack);
|
||||
assert(!stack->empty());
|
||||
const ChartHypothesis &bestHypo = **(stack->begin());
|
||||
return bestHypo.GetTotalScore();
|
||||
const HypoList *stack = chartCell->GetStack().cube;
|
||||
assert(stack);
|
||||
assert(!stack->empty());
|
||||
const ChartHypothesis &bestHypo = **(stack->begin());
|
||||
return bestHypo.GetTotalScore();
|
||||
}
|
||||
|
||||
void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
|
@ -14,299 +14,299 @@
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
struct CNStats {
|
||||
size_t created,destr,read,colls,words;
|
||||
|
||||
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
|
||||
~CNStats() {
|
||||
print(std::cerr);
|
||||
}
|
||||
struct CNStats {
|
||||
size_t created,destr,read,colls,words;
|
||||
|
||||
void createOne() {
|
||||
++created;
|
||||
}
|
||||
void destroyOne() {
|
||||
++destr;
|
||||
}
|
||||
|
||||
void collect(const ConfusionNet& cn) {
|
||||
++read;
|
||||
colls+=cn.GetSize();
|
||||
for(size_t i=0; i<cn.GetSize(); ++i)
|
||||
words+=cn[i].size();
|
||||
}
|
||||
void print(std::ostream& out) const {
|
||||
if(created>0) {
|
||||
out<<"confusion net statistics:\n"
|
||||
" created:\t"<<created<<"\n"
|
||||
" destroyed:\t"<<destr<<"\n"
|
||||
" succ. read:\t"<<read<<"\n"
|
||||
" columns:\t"<<colls<<"\n"
|
||||
" words:\t"<<words<<"\n"
|
||||
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
|
||||
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
|
||||
"\n\n";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
CNStats stats;
|
||||
|
||||
size_t
|
||||
ConfusionNet::
|
||||
GetColumnIncrement(size_t i, size_t j) const
|
||||
{
|
||||
(void) i;
|
||||
(void) j;
|
||||
return 1;
|
||||
CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
|
||||
~CNStats() {
|
||||
print(std::cerr);
|
||||
}
|
||||
|
||||
ConfusionNet::
|
||||
ConfusionNet()
|
||||
: InputType()
|
||||
{
|
||||
stats.createOne();
|
||||
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
if (staticData.IsChart()) {
|
||||
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
|
||||
}
|
||||
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
|
||||
void createOne() {
|
||||
++created;
|
||||
}
|
||||
void destroyOne() {
|
||||
++destr;
|
||||
}
|
||||
|
||||
ConfusionNet::
|
||||
~ConfusionNet()
|
||||
{
|
||||
stats.destroyOne();
|
||||
void collect(const ConfusionNet& cn) {
|
||||
++read;
|
||||
colls+=cn.GetSize();
|
||||
for(size_t i=0; i<cn.GetSize(); ++i)
|
||||
words+=cn[i].size();
|
||||
}
|
||||
|
||||
ConfusionNet::
|
||||
ConfusionNet(Sentence const& s)
|
||||
{
|
||||
data.resize(s.GetSize());
|
||||
for(size_t i=0; i<s.GetSize(); ++i) {
|
||||
ScorePair scorePair;
|
||||
std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
|
||||
data[i].push_back(temp);
|
||||
void print(std::ostream& out) const {
|
||||
if(created>0) {
|
||||
out<<"confusion net statistics:\n"
|
||||
" created:\t"<<created<<"\n"
|
||||
" destroyed:\t"<<destr<<"\n"
|
||||
" succ. read:\t"<<read<<"\n"
|
||||
" columns:\t"<<colls<<"\n"
|
||||
" words:\t"<<words<<"\n"
|
||||
" avg. word/column:\t"<<words/(1.0*colls)<<"\n"
|
||||
" avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
|
||||
"\n\n";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
|
||||
{
|
||||
VERBOSE(2, "read confusion net with format "<<format<<"\n");
|
||||
switch(format) {
|
||||
case 0:
|
||||
return ReadFormat0(in,factorOrder);
|
||||
case 1:
|
||||
return ReadFormat1(in,factorOrder);
|
||||
default:
|
||||
std::stringstream strme;
|
||||
strme << "ERROR: unknown format '"<<format
|
||||
<<"' in ConfusionNet::Read";
|
||||
UserMessage::Add(strme.str());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
CNStats stats;
|
||||
|
||||
int
|
||||
ConfusionNet::
|
||||
Read(std::istream& in,
|
||||
const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
int rv=ReadF(in,factorOrder,0);
|
||||
if(rv) stats.collect(*this);
|
||||
return rv;
|
||||
size_t
|
||||
ConfusionNet::
|
||||
GetColumnIncrement(size_t i, size_t j) const
|
||||
{
|
||||
(void) i;
|
||||
(void) j;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Default constructor.
 *
 * Counts the new object in the file-level `stats`, adds the input default
 * non-terminal to m_defaultLabelSet when StaticData reports chart mode, and
 * checks that an InputFeature instance is available.
 */
ConfusionNet::ConfusionNet()
  : InputType()
{
  stats.createOne();

  const StaticData &config = StaticData::Instance();
  if (config.IsChart()) {
    m_defaultLabelSet.insert(config.GetInputDefaultNonTerminal());
  }

  // NOTE(review): this compares the address of a reference with NULL; a
  // compiler may assume that is never true -- confirm the check can fire.
  UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
}
|
||||
|
||||
//! Destructor: only records the destruction in the file-level `stats`.
ConfusionNet::~ConfusionNet()
{
  stats.destroyOne();
}
|
||||
|
||||
/**
 * Builds a confusion net from a sentence: one column per word, each column
 * holding exactly that word paired with a default-constructed ScorePair.
 *
 * NOTE(review): unlike the default constructor this one does not call
 * stats.createOne(), although the destructor calls stats.destroyOne() --
 * confirm whether that is intended.
 */
ConfusionNet::ConfusionNet(Sentence const& s)
{
  data.resize(s.GetSize());
  for (size_t pos = 0; pos < s.GetSize(); ++pos) {
    ScorePair blankScores;
    data[pos].push_back(std::pair<Word, ScorePair >(s.GetWord(pos), blankScores));
  }
}
|
||||
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadF(std::istream& in, const std::vector<FactorType>& factorOrder, int format)
|
||||
{
|
||||
VERBOSE(2, "read confusion net with format "<<format<<"\n");
|
||||
switch(format) {
|
||||
case 0:
|
||||
return ReadFormat0(in,factorOrder);
|
||||
case 1:
|
||||
return ReadFormat1(in,factorOrder);
|
||||
default:
|
||||
std::stringstream strme;
|
||||
strme << "ERROR: unknown format '"<<format
|
||||
<<"' in ConfusionNet::Read";
|
||||
UserMessage::Add(strme.str());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int
|
||||
ConfusionNet::
|
||||
Read(std::istream& in,
|
||||
const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
int rv=ReadF(in,factorOrder,0);
|
||||
if(rv) stats.collect(*this);
|
||||
return rv;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Deprecated due to code duplication;
|
||||
// use Word::CreateFromString() instead
|
||||
void
|
||||
ConfusionNet::
|
||||
String2Word(const std::string& s,Word& w,
|
||||
const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
std::vector<std::string> factorStrVector = Tokenize(s, "|");
|
||||
for(size_t i=0; i<factorOrder.size(); ++i)
|
||||
w.SetFactor(factorOrder[i],
|
||||
FactorCollection::Instance().AddFactor
|
||||
(Input,factorOrder[i], factorStrVector[i]));
|
||||
}
|
||||
// Deprecated due to code duplication;
|
||||
// use Word::CreateFromString() instead
|
||||
void
|
||||
ConfusionNet::
|
||||
String2Word(const std::string& s,Word& w,
|
||||
const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
std::vector<std::string> factorStrVector = Tokenize(s, "|");
|
||||
for(size_t i=0; i<factorOrder.size(); ++i)
|
||||
w.SetFactor(factorOrder[i],
|
||||
FactorCollection::Instance().AddFactor
|
||||
(Input,factorOrder[i], factorStrVector[i]));
|
||||
}
|
||||
#endif
|
||||
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
Clear();
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadFormat0(std::istream& in, const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
Clear();
|
||||
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
const InputFeature &inputFeature = InputFeature::Instance();
|
||||
size_t numInputScores = inputFeature.GetNumInputScores();
|
||||
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
const InputFeature &inputFeature = InputFeature::Instance();
|
||||
size_t numInputScores = inputFeature.GetNumInputScores();
|
||||
size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();
|
||||
|
||||
size_t totalCount = numInputScores + numRealWordCount;
|
||||
bool addRealWordCount = (numRealWordCount > 0);
|
||||
size_t totalCount = numInputScores + numRealWordCount;
|
||||
bool addRealWordCount = (numRealWordCount > 0);
|
||||
|
||||
std::string line;
|
||||
while(getline(in,line)) {
|
||||
std::istringstream is(line);
|
||||
std::string word;
|
||||
std::string line;
|
||||
while(getline(in,line)) {
|
||||
std::istringstream is(line);
|
||||
std::string word;
|
||||
|
||||
Column col;
|
||||
while(is>>word) {
|
||||
Word w;
|
||||
// String2Word(word,w,factorOrder);
|
||||
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
|
||||
std::vector<float> probs(totalCount, 0.0);
|
||||
for(size_t i=0; i < numInputScores; i++) {
|
||||
double prob;
|
||||
if (!(is>>prob)) {
|
||||
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
|
||||
return false;
|
||||
}
|
||||
if(prob<0.0) {
|
||||
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
|
||||
prob=0.0;
|
||||
} else if (prob>1.0) {
|
||||
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
|
||||
prob=1.0;
|
||||
}
|
||||
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
|
||||
Column col;
|
||||
while(is>>word) {
|
||||
Word w;
|
||||
// String2Word(word,w,factorOrder);
|
||||
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
|
||||
std::vector<float> probs(totalCount, 0.0);
|
||||
for(size_t i=0; i < numInputScores; i++) {
|
||||
double prob;
|
||||
if (!(is>>prob)) {
|
||||
TRACE_ERR("ERROR: unable to parse CN input - bad link probability, or wrong number of scores\n");
|
||||
return false;
|
||||
}
|
||||
if(prob<0.0) {
|
||||
VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
|
||||
prob=0.0;
|
||||
} else if (prob>1.0) {
|
||||
VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
|
||||
prob=1.0;
|
||||
}
|
||||
probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
|
||||
|
||||
}
|
||||
//store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
|
||||
if (addRealWordCount && word!=EPSILON && word!="")
|
||||
probs.back() = -1.0;
|
||||
|
||||
ScorePair scorePair(probs);
|
||||
|
||||
col.push_back(std::make_pair(w,scorePair));
|
||||
}
|
||||
if(col.size()) {
|
||||
data.push_back(col);
|
||||
ShrinkToFit(data.back());
|
||||
} else break;
|
||||
}
|
||||
return !data.empty();
|
||||
}
|
||||
//store 'real' word count in last feature if we have one more weight than we do arc scores and not epsilon
|
||||
if (addRealWordCount && word!=EPSILON && word!="")
|
||||
probs.back() = -1.0;
|
||||
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
Clear();
|
||||
std::string line;
|
||||
ScorePair scorePair(probs);
|
||||
|
||||
col.push_back(std::make_pair(w,scorePair));
|
||||
}
|
||||
if(col.size()) {
|
||||
data.push_back(col);
|
||||
ShrinkToFit(data.back());
|
||||
} else break;
|
||||
}
|
||||
return !data.empty();
|
||||
}
|
||||
|
||||
bool
|
||||
ConfusionNet::
|
||||
ReadFormat1(std::istream& in, const std::vector<FactorType>& factorOrder)
|
||||
{
|
||||
Clear();
|
||||
std::string line;
|
||||
if(!getline(in,line)) return 0;
|
||||
size_t s;
|
||||
if(getline(in,line)) s=atoi(line.c_str());
|
||||
else return 0;
|
||||
data.resize(s);
|
||||
for(size_t i=0; i<data.size(); ++i) {
|
||||
if(!getline(in,line)) return 0;
|
||||
size_t s;
|
||||
if(getline(in,line)) s=atoi(line.c_str());
|
||||
else return 0;
|
||||
data.resize(s);
|
||||
for(size_t i=0; i<data.size(); ++i) {
|
||||
if(!getline(in,line)) return 0;
|
||||
std::istringstream is(line);
|
||||
if(!(is>>s)) return 0;
|
||||
std::string word;
|
||||
double prob;
|
||||
data[i].resize(s);
|
||||
for(size_t j=0; j<s; ++j)
|
||||
if(is>>word>>prob) {
|
||||
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
|
||||
data[i][j].second.denseScores = std::vector<float> (1);
|
||||
data[i][j].second.denseScores.push_back((float) log(prob));
|
||||
if(data[i][j].second.denseScores[0]<0) {
|
||||
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
|
||||
data[i][j].second.denseScores[0]=0.0;
|
||||
}
|
||||
// String2Word(word,data[i][j].first,factorOrder);
|
||||
Word& w = data[i][j].first;
|
||||
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
|
||||
} else return 0;
|
||||
}
|
||||
return !data.empty();
|
||||
std::istringstream is(line);
|
||||
if(!(is>>s)) return 0;
|
||||
std::string word;
|
||||
double prob;
|
||||
data[i].resize(s);
|
||||
for(size_t j=0; j<s; ++j)
|
||||
if(is>>word>>prob) {
|
||||
//TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
|
||||
data[i][j].second.denseScores = std::vector<float> (1);
|
||||
data[i][j].second.denseScores.push_back((float) log(prob));
|
||||
if(data[i][j].second.denseScores[0]<0) {
|
||||
VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
|
||||
data[i][j].second.denseScores[0]=0.0;
|
||||
}
|
||||
// String2Word(word,data[i][j].first,factorOrder);
|
||||
Word& w = data[i][j].first;
|
||||
w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
|
||||
} else return 0;
|
||||
}
|
||||
return !data.empty();
|
||||
}
|
||||
|
||||
void ConfusionNet::Print(std::ostream& out) const
|
||||
{
|
||||
out<<"conf net: "<<data.size()<<"\n";
|
||||
for(size_t i=0; i<data.size(); ++i) {
|
||||
out<<i<<" -- ";
|
||||
for(size_t j=0; j<data[i].size(); ++j) {
|
||||
out<<"("<<data[i][j].first.ToString()<<", ";
|
||||
void ConfusionNet::Print(std::ostream& out) const
|
||||
{
|
||||
out<<"conf net: "<<data.size()<<"\n";
|
||||
for(size_t i=0; i<data.size(); ++i) {
|
||||
out<<i<<" -- ";
|
||||
for(size_t j=0; j<data[i].size(); ++j) {
|
||||
out<<"("<<data[i][j].first.ToString()<<", ";
|
||||
|
||||
// dense
|
||||
std::vector<float>::const_iterator iterDense;
|
||||
for(iterDense = data[i][j].second.denseScores.begin();
|
||||
iterDense < data[i][j].second.denseScores.end();
|
||||
++iterDense) {
|
||||
out<<", "<<*iterDense;
|
||||
}
|
||||
|
||||
// sparse
|
||||
std::map<StringPiece, float>::const_iterator iterSparse;
|
||||
for(iterSparse = data[i][j].second.sparseScores.begin();
|
||||
iterSparse != data[i][j].second.sparseScores.end();
|
||||
++iterSparse) {
|
||||
out << ", " << iterSparse->first << "=" << iterSparse->second;
|
||||
}
|
||||
|
||||
out<<") ";
|
||||
// dense
|
||||
std::vector<float>::const_iterator iterDense;
|
||||
for(iterDense = data[i][j].second.denseScores.begin();
|
||||
iterDense < data[i][j].second.denseScores.end();
|
||||
++iterDense) {
|
||||
out<<", "<<*iterDense;
|
||||
}
|
||||
out<<"\n";
|
||||
|
||||
// sparse
|
||||
std::map<StringPiece, float>::const_iterator iterSparse;
|
||||
for(iterSparse = data[i][j].second.sparseScores.begin();
|
||||
iterSparse != data[i][j].second.sparseScores.end();
|
||||
++iterSparse) {
|
||||
out << ", " << iterSparse->first << "=" << iterSparse->second;
|
||||
}
|
||||
|
||||
out<<") ";
|
||||
}
|
||||
out<<"\n\n";
|
||||
out<<"\n";
|
||||
}
|
||||
out<<"\n\n";
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable:4716)
|
||||
#endif
|
||||
/**
 * Not supported for confusion nets: the body consists solely of a
 * UTIL_THROW2 with an error message and has no return statement.
 */
Phrase ConfusionNet::GetSubString(const WordsRange&) const
{
  UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
  // (a `return Phrase(Input);` used to be sketched here but is not active)
}
|
||||
|
||||
std::string
|
||||
ConfusionNet::
|
||||
GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
|
||||
{
|
||||
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
|
||||
return "";
|
||||
}
|
||||
std::string
|
||||
ConfusionNet::
|
||||
GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
|
||||
{
|
||||
TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
|
||||
return "";
|
||||
}
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable:4716)
|
||||
#endif
|
||||
/**
 * Word access by position is not supported for confusion nets: the body
 * consists solely of a UTIL_THROW2 and has no return statement.
 */
const Word& ConfusionNet::GetWord(size_t) const
{
  // The message used to refer to "GetFactorArray", which is not the name of
  // this function; report the actual entry point instead.
  UTIL_THROW2("ERROR: call to ConfusionNet::GetWord\n");
}
|
||||
#ifdef _WIN32
|
||||
#pragma warning(default:4716)
|
||||
#endif
|
||||
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
|
||||
{
|
||||
cn.Print(out);
|
||||
return out;
|
||||
}
|
||||
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
|
||||
{
|
||||
cn.Print(out);
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
 * Allocates a TranslationOptionCollectionConfusionNet for this confusion net.
 *
 * The per-coverage option limit and the option threshold are read from
 * StaticData. The object is created with `new` and returned as a raw pointer.
 */
TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const
{
  const StaticData &config = StaticData::Instance();
  const size_t maxNoTransOptPerCoverage = config.GetMaxNoTransOptPerCoverage();
  const float translationOptionThreshold = config.GetTranslationOptionThreshold();

  TranslationOptionCollection *collection =
    new TranslationOptionCollectionConfusionNet(*this, maxNoTransOptPerCoverage, translationOptionThreshold);
  assert(collection);
  return collection;
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -8,18 +8,18 @@ using namespace std;
|
||||
namespace Moses
|
||||
{
|
||||
/**
 * Constructs the feature from its configuration line.
 *
 * Starts with m_all enabled and both syntax switches disabled, then calls
 * ReadParameters().
 */
CountNonTerms::CountNonTerms(const std::string &line)
  : StatelessFeatureFunction(line),
    m_all(true),
    m_sourceSyntax(false),
    m_targetSyntax(false)
{
  ReadParameters();
}
|
||||
|
||||
void CountNonTerms::Evaluate(const Phrase &sourcePhrase
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
|
||||
@ -27,33 +27,33 @@ void CountNonTerms::Evaluate(const Phrase &sourcePhrase
|
||||
size_t indScore = 0;
|
||||
|
||||
if (m_all) {
|
||||
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
|
||||
const Word &word = targetPhrase.GetWord(i);
|
||||
if (word.IsNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
|
||||
const Word &word = targetPhrase.GetWord(i);
|
||||
if (word.IsNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
}
|
||||
|
||||
if (m_targetSyntax) {
|
||||
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
|
||||
const Word &word = targetPhrase.GetWord(i);
|
||||
if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
|
||||
const Word &word = targetPhrase.GetWord(i);
|
||||
if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
}
|
||||
|
||||
if (m_sourceSyntax) {
|
||||
for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) {
|
||||
const Word &word = sourcePhrase.GetWord(i);
|
||||
if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) {
|
||||
const Word &word = sourcePhrase.GetWord(i);
|
||||
if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) {
|
||||
++scores[indScore];
|
||||
}
|
||||
}
|
||||
++indScore;
|
||||
}
|
||||
|
||||
scoreBreakdown.PlusEquals(this, scores);
|
||||
@ -64,9 +64,9 @@ void CountNonTerms::SetParameter(const std::string& key, const std::string& valu
|
||||
if (key == "all") {
|
||||
m_all = Scan<bool>(value);
|
||||
} else if (key == "source-syntax") {
|
||||
m_sourceSyntax = Scan<bool>(value);
|
||||
m_sourceSyntax = Scan<bool>(value);
|
||||
} else if (key == "target-syntax") {
|
||||
m_targetSyntax = Scan<bool>(value);
|
||||
m_targetSyntax = Scan<bool>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
|
@ -9,8 +9,9 @@ class CountNonTerms : public StatelessFeatureFunction
|
||||
{
|
||||
public:
|
||||
CountNonTerms(const std::string &line);
|
||||
bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
|
@ -5,439 +5,440 @@
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
std::map< const std::string, DynamicCacheBasedLanguageModel * > DynamicCacheBasedLanguageModel::s_instance_map;
|
||||
DynamicCacheBasedLanguageModel *DynamicCacheBasedLanguageModel::s_instance = NULL;
|
||||
|
||||
DynamicCacheBasedLanguageModel::DynamicCacheBasedLanguageModel(const std::string &line)
|
||||
|
||||
std::map< const std::string, DynamicCacheBasedLanguageModel * > DynamicCacheBasedLanguageModel::s_instance_map;
|
||||
DynamicCacheBasedLanguageModel *DynamicCacheBasedLanguageModel::s_instance = NULL;
|
||||
|
||||
/**
 * Constructs the feature from its configuration line.
 *
 * Sets defaults (all-substrings query, hyperbola score, max age 1000, name
 * "default"), then calls ReadParameters(), and finally registers the object
 * in s_instance_map under m_name. A second feature with the same name is
 * rejected via UTIL_THROW_IF2.
 */
DynamicCacheBasedLanguageModel::DynamicCacheBasedLanguageModel(const std::string &line)
  : StatelessFeatureFunction(1, line)
{
  VERBOSE(2,"Initializing DynamicCacheBasedLanguageModel feature..." << std::endl);

  // Defaults, set before ReadParameters() runs.
  m_name = "default";
  m_maxAge = 1000;
  m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
  m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;

  ReadParameters();

  const bool nameTaken = (s_instance_map.find(m_name) != s_instance_map.end());
  UTIL_THROW_IF2(nameTaken, "Only 1 DynamicCacheBasedLanguageModel feature named " + m_name + " is allowed");

  s_instance_map[m_name] = this;
  s_instance = this; // kept for backward compatibility
}
|
||||
|
||||
// Empty destructor.
// NOTE(review): the constructor's s_instance_map entry is not removed here.
DynamicCacheBasedLanguageModel::~DynamicCacheBasedLanguageModel()
{
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetPreComputedScores()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
precomputedScores.clear();
|
||||
for (unsigned int i=0; i<m_maxAge; i++) {
|
||||
precomputedScores.push_back(decaying_score(i));
|
||||
}
|
||||
|
||||
if ( m_score_type == CBLM_SCORE_TYPE_HYPERBOLA
|
||||
|| m_score_type == CBLM_SCORE_TYPE_POWER
|
||||
|| m_score_type == CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
|| m_score_type == CBLM_SCORE_TYPE_COSINE ) {
|
||||
precomputedScores.push_back(decaying_score(m_maxAge));
|
||||
} else { // m_score_type = CBLM_SCORE_TYPE_XXXXXXXXX_REWARD
|
||||
precomputedScores.push_back(0.0);
|
||||
}
|
||||
m_lower_score = precomputedScores[m_maxAge];
|
||||
std::cerr << "SetPreComputedScores(): lower_score:" << m_lower_score << std::endl;
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::GetPreComputedScores(const unsigned int age)
|
||||
{
|
||||
if (age < precomputedScores.size()) {
|
||||
return precomputedScores.at(age);
|
||||
} else {
|
||||
return precomputedScores.at(m_maxAge);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
VERBOSE(2, "DynamicCacheBasedLanguageModel::SetParameter key:|" << key << "| value:|" << value << "|" << std::endl);
|
||||
if (key == "cblm-query-type") {
|
||||
SetQueryType(Scan<size_t>(value));
|
||||
} else if (key == "cblm-score-type") {
|
||||
SetScoreType(Scan<size_t>(value));
|
||||
} else if (key == "cblm-max-age") {
|
||||
SetMaxAge(Scan<unsigned int>(value));
|
||||
} else if (key == "cblm-file") {
|
||||
m_initfiles = Scan<std::string>(value);
|
||||
} else if (key == "cblm-name") {
|
||||
m_name = Scan<std::string>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Evaluate(const Phrase &sp
|
||||
, const TargetPhrase &tp
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
float score = m_lower_score;
|
||||
switch(m_query_type) {
|
||||
case CBLM_QUERY_TYPE_WHOLESTRING:
|
||||
score = Evaluate_Whole_String(tp);
|
||||
break;
|
||||
case CBLM_QUERY_TYPE_ALLSUBSTRINGS:
|
||||
score = Evaluate_All_Substrings(tp);
|
||||
break;
|
||||
default:
|
||||
UTIL_THROW_IF2(false, "This score type (" << m_query_type << ") is unknown.");
|
||||
}
|
||||
|
||||
scoreBreakdown.Assign(this, score);
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp) const
|
||||
{
|
||||
//consider all words in the TargetPhrase as one n-gram
|
||||
// and compute the decaying_score for the whole n-gram
|
||||
// and return this value
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = m_lower_score;
|
||||
|
||||
std::string w = "";
|
||||
size_t endpos = tp.GetSize();
|
||||
for (size_t pos = 0 ; pos < endpos ; ++pos) {
|
||||
w += tp.GetWord(pos).GetFactor(0)->GetString().as_string();
|
||||
if ((pos == 0) && (endpos > 1)) {
|
||||
w += " ";
|
||||
}
|
||||
}
|
||||
it = m_cache.find(w);
|
||||
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: searching w:|" << w << "|" << std::endl);
|
||||
if (it != m_cache.end()) { //found!
|
||||
score = ((*it).second).second;
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: found w:|" << w << "|" << std::endl);
|
||||
}
|
||||
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp) const
|
||||
{
|
||||
//loop over all n-grams in the TargetPhrase (no matter of n)
|
||||
//and compute the decaying_score for all words
|
||||
//and return their sum
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
|
||||
for (size_t startpos = 0 ; startpos < tp.GetSize() ; ++startpos) {
|
||||
std::string w = "";
|
||||
for (size_t endpos = startpos; endpos < tp.GetSize() ; ++endpos) {
|
||||
w += tp.GetWord(endpos).GetFactor(0)->GetString().as_string();
|
||||
it = m_cache.find(w);
|
||||
|
||||
if (it != m_cache.end()) { //found!
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" << score << "|" << std::endl);
|
||||
} else {
|
||||
score += m_lower_score;
|
||||
}
|
||||
|
||||
if (endpos == startpos) {
|
||||
w += " ";
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Print() const
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::const_iterator it;
|
||||
std::cout << "Content of the cache of Cache-Based Language Model" << std::endl;
|
||||
std::cout << "Size of the cache of Cache-Based Language Model:|" << m_cache.size() << "|" << std::endl;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
std::cout << "word:|" << (*it).first << "| age:|" << ((*it).second).first << "| score:|" << ((*it).second).second << "|" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Decay()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::iterator it;
|
||||
|
||||
unsigned int age;
|
||||
float score;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
age=((*it).second).first + 1;
|
||||
if (age > m_maxAge) {
|
||||
m_cache.erase(it);
|
||||
it--;
|
||||
} else {
|
||||
score = decaying_score(age);
|
||||
decaying_cache_value_t p (age, score);
|
||||
(*it).second = p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Update(std::vector<std::string> words, int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
|
||||
for (size_t j=0; j<words.size(); j++) {
|
||||
words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::Update word[" << j << "]:"<< words[j] << " age:" << age << " decaying_score(age):" << decaying_score(age) << std::endl);
|
||||
decaying_cache_value_t p (age,decaying_score(age));
|
||||
std::pair<std::string, decaying_cache_value_t> e (words[j],p);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
m_cache.insert(e); //insert the entry
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ClearEntries(std::string &entries)
|
||||
{
|
||||
if (entries != "") {
|
||||
VERBOSE(3,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
ClearEntries(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ClearEntries(std::vector<std::string> words)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
|
||||
for (size_t j=0; j<words.size(); j++) { words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::ClearEntries word[" << j << "]:"<< words[j] << std::endl);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::string &entries)
|
||||
{
|
||||
if (entries != "") {
|
||||
VERBOSE(3,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
Insert(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::vector<std::string> ngrams)
|
||||
{
|
||||
VERBOSE(3,"DynamicCacheBasedLanguageModel Insert ngrams.size():|" << ngrams.size() << "|" << std::endl);
|
||||
Decay();
|
||||
Update(ngrams,1);
|
||||
// Print();
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ExecuteDlt(std::map<std::string, std::string> dlt_meta)
|
||||
{
|
||||
if (dlt_meta.find("cblm") != dlt_meta.end()) {
|
||||
Insert(dlt_meta["cblm"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-command") != dlt_meta.end()) {
|
||||
Execute(dlt_meta["cblm-command"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-file") != dlt_meta.end()) {
|
||||
Load(dlt_meta["cblm-file"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-clear-entries") != dlt_meta.end()) {
|
||||
ClearEntries(dlt_meta["cblm-clear-entries"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-clear-all") != dlt_meta.end()) {
|
||||
Clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::string command)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Execute(std::string command:|" << command << "|" << std::endl);
|
||||
std::vector<std::string> commands = Tokenize(command, "||");
|
||||
Execute(commands);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::vector<std::string> commands)
|
||||
{
|
||||
for (size_t j=0; j<commands.size(); j++) {
|
||||
Execute_Single_Command(commands[j]);
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute_Single_Command(std::string command)
|
||||
{
|
||||
VERBOSE(2,"CacheBasedLanguageModel::Execute_Single_Command(std::string command:|" << command << "|" << std::endl);
|
||||
if (command == "clear") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Cache cleared." << std::endl);
|
||||
Clear();
|
||||
} else if (command == "settype_wholestring") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_WHOLESTRING << " (CBLM_QUERY_TYPE_WHOLESTRING)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_WHOLESTRING);
|
||||
} else if (command == "settype_allsubstrings") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << " (CBLM_QUERY_TYPE_ALLSUBSTRINGS)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_ALLSUBSTRINGS);
|
||||
} else {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "| is unknown. Skipped." << std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Clear()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
m_cache.clear();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load()
|
||||
{
|
||||
SetPreComputedScores();
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load()" << std::endl);
|
||||
Load(m_initfiles);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load(const std::string file)" << std::endl);
|
||||
std::vector<std::string> files = Tokenize(m_initfiles, "||");
|
||||
Load_Multiple_Files(files);
|
||||
}
|
||||
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)" << std::endl);
|
||||
for(size_t j = 0; j < files.size(); ++j) {
|
||||
Load_Single_File(files[j]);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)" << std::endl);
|
||||
//file format
|
||||
//age || n-gram
|
||||
//age || n-gram || n-gram || n-gram || ...
|
||||
//....
|
||||
//each n-gram is a sequence of n words (no matter of n)
|
||||
//
|
||||
//there is no limit on the size of n
|
||||
//
|
||||
//entries can be repeated, but the last entry overwrites the previous
|
||||
|
||||
|
||||
VERBOSE(2,"Loading data from the cache file " << file << std::endl);
|
||||
InputFileStream cacheFile(file);
|
||||
|
||||
std::string line;
|
||||
int age;
|
||||
std::vector<std::string> words;
|
||||
|
||||
while (getline(cacheFile, line)) {
|
||||
std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
|
||||
if (vecStr.size() >= 2) {
|
||||
age = Scan<int>(vecStr[0]);
|
||||
vecStr.erase(vecStr.begin());
|
||||
Update(vecStr,age);
|
||||
} else {
|
||||
UTIL_THROW_IF2(false, "The format of the loaded file is wrong: " << line);
|
||||
}
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetQueryType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
|
||||
m_query_type = type;
|
||||
if ( m_query_type != CBLM_QUERY_TYPE_WHOLESTRING
|
||||
&& m_query_type != CBLM_QUERY_TYPE_ALLSUBSTRINGS ) {
|
||||
VERBOSE(2, "This query type " << m_query_type << " is unknown. Instead used " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << "." << std::endl);
|
||||
m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel QueryType: " << m_query_type << std::endl);
|
||||
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetScoreType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
m_score_type = type;
|
||||
if ( m_score_type != CBLM_SCORE_TYPE_HYPERBOLA
|
||||
&& m_score_type != CBLM_SCORE_TYPE_POWER
|
||||
&& m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
&& m_score_type != CBLM_SCORE_TYPE_COSINE
|
||||
&& m_score_type != CBLM_SCORE_TYPE_HYPERBOLA_REWARD
|
||||
&& m_score_type != CBLM_SCORE_TYPE_POWER_REWARD
|
||||
&& m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL_REWARD ) {
|
||||
VERBOSE(2, "This score type " << m_score_type << " is unknown. Instead used " << CBLM_SCORE_TYPE_HYPERBOLA << "." << std::endl);
|
||||
m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel ScoreType: " << m_score_type << std::endl);
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetMaxAge(unsigned int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
m_maxAge = age;
|
||||
VERBOSE(2, "CacheBasedLanguageModel MaxAge: " << m_maxAge << std::endl);
|
||||
};
|
||||
|
||||
float DynamicCacheBasedLanguageModel::decaying_score(const unsigned int age)
|
||||
{
|
||||
float sc;
|
||||
switch(m_score_type) {
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA:
|
||||
sc = (float) 1.0/age - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER:
|
||||
sc = (float) pow(age, -0.25) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL:
|
||||
sc = (age == 1) ? 0.0 : (float) exp( 1.0/age ) / exp(1.0) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_COSINE:
|
||||
sc = (float) cos( (age-1) * (PI/2) / m_maxAge ) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA_REWARD:
|
||||
sc = (float) 1.0/age;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER_REWARD:
|
||||
sc = (float) pow(age, -0.25);
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL_REWARD:
|
||||
sc = (age == 1) ? 1.0 : (float) exp( 1.0/age ) / exp(1.0);
|
||||
break;
|
||||
default:
|
||||
sc = -1.0;
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
{
|
||||
VERBOSE(2,"Initializing DynamicCacheBasedLanguageModel feature..." << std::endl);
|
||||
|
||||
m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
m_maxAge = 1000;
|
||||
m_name = "default";
|
||||
|
||||
ReadParameters();
|
||||
UTIL_THROW_IF2(s_instance_map.find(m_name) != s_instance_map.end(), "Only 1 DynamicCacheBasedLanguageModel feature named " + m_name + " is allowed");
|
||||
s_instance_map[m_name] = this;
|
||||
s_instance = this; //for back compatibility
|
||||
}
|
||||
|
||||
// Destructor: performs no explicit cleanup; members are destroyed implicitly.
DynamicCacheBasedLanguageModel::~DynamicCacheBasedLanguageModel()
{
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetPreComputedScores()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
precomputedScores.clear();
|
||||
for (unsigned int i=0; i<m_maxAge; i++) {
|
||||
precomputedScores.push_back(decaying_score(i));
|
||||
}
|
||||
|
||||
if ( m_score_type == CBLM_SCORE_TYPE_HYPERBOLA
|
||||
|| m_score_type == CBLM_SCORE_TYPE_POWER
|
||||
|| m_score_type == CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
|| m_score_type == CBLM_SCORE_TYPE_COSINE ) {
|
||||
precomputedScores.push_back(decaying_score(m_maxAge));
|
||||
} else { // m_score_type = CBLM_SCORE_TYPE_XXXXXXXXX_REWARD
|
||||
precomputedScores.push_back(0.0);
|
||||
}
|
||||
m_lower_score = precomputedScores[m_maxAge];
|
||||
std::cerr << "SetPreComputedScores(): lower_score:" << m_lower_score << std::endl;
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::GetPreComputedScores(const unsigned int age)
|
||||
{
|
||||
if (age < precomputedScores.size()) {
|
||||
return precomputedScores.at(age);
|
||||
} else {
|
||||
return precomputedScores.at(m_maxAge);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
VERBOSE(2, "DynamicCacheBasedLanguageModel::SetParameter key:|" << key << "| value:|" << value << "|" << std::endl);
|
||||
if (key == "cblm-query-type") {
|
||||
SetQueryType(Scan<size_t>(value));
|
||||
} else if (key == "cblm-score-type") {
|
||||
SetScoreType(Scan<size_t>(value));
|
||||
} else if (key == "cblm-max-age") {
|
||||
SetMaxAge(Scan<unsigned int>(value));
|
||||
} else if (key == "cblm-file") {
|
||||
m_initfiles = Scan<std::string>(value);
|
||||
} else if (key == "cblm-name") {
|
||||
m_name = Scan<std::string>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Evaluate(const Phrase &sp
|
||||
, const TargetPhrase &tp
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
float score = m_lower_score;
|
||||
switch(m_query_type) {
|
||||
case CBLM_QUERY_TYPE_WHOLESTRING:
|
||||
score = Evaluate_Whole_String(tp);
|
||||
break;
|
||||
case CBLM_QUERY_TYPE_ALLSUBSTRINGS:
|
||||
score = Evaluate_All_Substrings(tp);
|
||||
break;
|
||||
default:
|
||||
UTIL_THROW_IF2(false, "This score type (" << m_query_type << ") is unknown.");
|
||||
}
|
||||
|
||||
scoreBreakdown.Assign(this, score);
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp) const
|
||||
{
|
||||
//consider all words in the TargetPhrase as one n-gram
|
||||
// and compute the decaying_score for the whole n-gram
|
||||
// and return this value
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = m_lower_score;
|
||||
|
||||
std::string w = "";
|
||||
size_t endpos = tp.GetSize();
|
||||
for (size_t pos = 0 ; pos < endpos ; ++pos) {
|
||||
w += tp.GetWord(pos).GetFactor(0)->GetString().as_string();
|
||||
if ((pos == 0) && (endpos > 1)) {
|
||||
w += " ";
|
||||
}
|
||||
}
|
||||
it = m_cache.find(w);
|
||||
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: searching w:|" << w << "|" << std::endl);
|
||||
if (it != m_cache.end()) { //found!
|
||||
score = ((*it).second).second;
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: found w:|" << w << "|" << std::endl);
|
||||
}
|
||||
|
||||
VERBOSE(4,"cblm::Evaluate_Whole_String: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
float DynamicCacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp) const
|
||||
{
|
||||
//loop over all n-grams in the TargetPhrase (no matter of n)
|
||||
//and compute the decaying_score for all words
|
||||
//and return their sum
|
||||
|
||||
decaying_cache_t::const_iterator it;
|
||||
float score = 0.0;
|
||||
|
||||
for (size_t startpos = 0 ; startpos < tp.GetSize() ; ++startpos) {
|
||||
std::string w = "";
|
||||
for (size_t endpos = startpos; endpos < tp.GetSize() ; ++endpos) {
|
||||
w += tp.GetWord(endpos).GetFactor(0)->GetString().as_string();
|
||||
it = m_cache.find(w);
|
||||
|
||||
if (it != m_cache.end()) { //found!
|
||||
score += ((*it).second).second;
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: found w:|" << w << "| actual score:|" << ((*it).second).second << "| score:|" << score << "|" << std::endl);
|
||||
} else {
|
||||
score += m_lower_score;
|
||||
}
|
||||
|
||||
if (endpos == startpos) {
|
||||
w += " ";
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
VERBOSE(3,"cblm::Evaluate_All_Substrings: returning score:|" << score << "|" << std::endl);
|
||||
return score;
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Print() const
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::const_iterator it;
|
||||
std::cout << "Content of the cache of Cache-Based Language Model" << std::endl;
|
||||
std::cout << "Size of the cache of Cache-Based Language Model:|" << m_cache.size() << "|" << std::endl;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
std::cout << "word:|" << (*it).first << "| age:|" << ((*it).second).first << "| score:|" << ((*it).second).second << "|" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Decay()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
decaying_cache_t::iterator it;
|
||||
|
||||
unsigned int age;
|
||||
float score;
|
||||
for ( it=m_cache.begin() ; it != m_cache.end(); it++ ) {
|
||||
age=((*it).second).first + 1;
|
||||
if (age > m_maxAge) {
|
||||
m_cache.erase(it);
|
||||
it--;
|
||||
} else {
|
||||
score = decaying_score(age);
|
||||
decaying_cache_value_t p (age, score);
|
||||
(*it).second = p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Update(std::vector<std::string> words, int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
|
||||
for (size_t j=0; j<words.size(); j++) {
|
||||
words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::Update word[" << j << "]:"<< words[j] << " age:" << age << " decaying_score(age):" << decaying_score(age) << std::endl);
|
||||
decaying_cache_value_t p (age,decaying_score(age));
|
||||
std::pair<std::string, decaying_cache_value_t> e (words[j],p);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
m_cache.insert(e); //insert the entry
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ClearEntries(std::string &entries)
|
||||
{
|
||||
if (entries != "") {
|
||||
VERBOSE(3,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
ClearEntries(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ClearEntries(std::vector<std::string> words)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
VERBOSE(3,"words.size():|" << words.size() << "|" << std::endl);
|
||||
for (size_t j=0; j<words.size(); j++) {
|
||||
words[j] = Trim(words[j]);
|
||||
VERBOSE(3,"CacheBasedLanguageModel::ClearEntries word[" << j << "]:"<< words[j] << std::endl);
|
||||
m_cache.erase(words[j]); //always erase the element (do nothing if the entry does not exist)
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::string &entries)
|
||||
{
|
||||
if (entries != "") {
|
||||
VERBOSE(3,"entries:|" << entries << "|" << std::endl);
|
||||
std::vector<std::string> elements = TokenizeMultiCharSeparator(entries, "||");
|
||||
VERBOSE(3,"elements.size() after:|" << elements.size() << "|" << std::endl);
|
||||
Insert(elements);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Insert(std::vector<std::string> ngrams)
|
||||
{
|
||||
VERBOSE(3,"DynamicCacheBasedLanguageModel Insert ngrams.size():|" << ngrams.size() << "|" << std::endl);
|
||||
Decay();
|
||||
Update(ngrams,1);
|
||||
// Print();
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::ExecuteDlt(std::map<std::string, std::string> dlt_meta)
|
||||
{
|
||||
if (dlt_meta.find("cblm") != dlt_meta.end()) {
|
||||
Insert(dlt_meta["cblm"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-command") != dlt_meta.end()) {
|
||||
Execute(dlt_meta["cblm-command"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-file") != dlt_meta.end()) {
|
||||
Load(dlt_meta["cblm-file"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-clear-entries") != dlt_meta.end()) {
|
||||
ClearEntries(dlt_meta["cblm-clear-entries"]);
|
||||
}
|
||||
if (dlt_meta.find("cblm-clear-all") != dlt_meta.end()) {
|
||||
Clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::string command)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Execute(std::string command:|" << command << "|" << std::endl);
|
||||
std::vector<std::string> commands = Tokenize(command, "||");
|
||||
Execute(commands);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute(std::vector<std::string> commands)
|
||||
{
|
||||
for (size_t j=0; j<commands.size(); j++) {
|
||||
Execute_Single_Command(commands[j]);
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Execute_Single_Command(std::string command)
|
||||
{
|
||||
VERBOSE(2,"CacheBasedLanguageModel::Execute_Single_Command(std::string command:|" << command << "|" << std::endl);
|
||||
if (command == "clear") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Cache cleared." << std::endl);
|
||||
Clear();
|
||||
} else if (command == "settype_wholestring") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_WHOLESTRING << " (CBLM_QUERY_TYPE_WHOLESTRING)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_WHOLESTRING);
|
||||
} else if (command == "settype_allsubstrings") {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "|. Query type set to " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << " (CBLM_QUERY_TYPE_ALLSUBSTRINGS)." << std::endl);
|
||||
SetQueryType(CBLM_QUERY_TYPE_ALLSUBSTRINGS);
|
||||
} else {
|
||||
VERBOSE(2,"CacheBasedLanguageModel Execute command:|"<< command << "| is unknown. Skipped." << std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Clear()
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_cacheLock);
|
||||
#endif
|
||||
m_cache.clear();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load()
|
||||
{
|
||||
SetPreComputedScores();
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load()" << std::endl);
|
||||
Load(m_initfiles);
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load(const std::string file)" << std::endl);
|
||||
std::vector<std::string> files = Tokenize(m_initfiles, "||");
|
||||
Load_Multiple_Files(files);
|
||||
}
|
||||
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Multiple_Files(std::vector<std::string> files)" << std::endl);
|
||||
for(size_t j = 0; j < files.size(); ++j) {
|
||||
Load_Single_File(files[j]);
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)
|
||||
{
|
||||
VERBOSE(2,"DynamicCacheBasedLanguageModel::Load_Single_File(const std::string file)" << std::endl);
|
||||
//file format
|
||||
//age || n-gram
|
||||
//age || n-gram || n-gram || n-gram || ...
|
||||
//....
|
||||
//each n-gram is a sequence of n words (no matter of n)
|
||||
//
|
||||
//there is no limit on the size of n
|
||||
//
|
||||
//entries can be repeated, but the last entry overwrites the previous
|
||||
|
||||
|
||||
VERBOSE(2,"Loading data from the cache file " << file << std::endl);
|
||||
InputFileStream cacheFile(file);
|
||||
|
||||
std::string line;
|
||||
int age;
|
||||
std::vector<std::string> words;
|
||||
|
||||
while (getline(cacheFile, line)) {
|
||||
std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
|
||||
if (vecStr.size() >= 2) {
|
||||
age = Scan<int>(vecStr[0]);
|
||||
vecStr.erase(vecStr.begin());
|
||||
Update(vecStr,age);
|
||||
} else {
|
||||
UTIL_THROW_IF2(false, "The format of the loaded file is wrong: " << line);
|
||||
}
|
||||
}
|
||||
IFVERBOSE(2) Print();
|
||||
}
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetQueryType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
|
||||
m_query_type = type;
|
||||
if ( m_query_type != CBLM_QUERY_TYPE_WHOLESTRING
|
||||
&& m_query_type != CBLM_QUERY_TYPE_ALLSUBSTRINGS ) {
|
||||
VERBOSE(2, "This query type " << m_query_type << " is unknown. Instead used " << CBLM_QUERY_TYPE_ALLSUBSTRINGS << "." << std::endl);
|
||||
m_query_type = CBLM_QUERY_TYPE_ALLSUBSTRINGS;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel QueryType: " << m_query_type << std::endl);
|
||||
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetScoreType(size_t type)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
m_score_type = type;
|
||||
if ( m_score_type != CBLM_SCORE_TYPE_HYPERBOLA
|
||||
&& m_score_type != CBLM_SCORE_TYPE_POWER
|
||||
&& m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL
|
||||
&& m_score_type != CBLM_SCORE_TYPE_COSINE
|
||||
&& m_score_type != CBLM_SCORE_TYPE_HYPERBOLA_REWARD
|
||||
&& m_score_type != CBLM_SCORE_TYPE_POWER_REWARD
|
||||
&& m_score_type != CBLM_SCORE_TYPE_EXPONENTIAL_REWARD ) {
|
||||
VERBOSE(2, "This score type " << m_score_type << " is unknown. Instead used " << CBLM_SCORE_TYPE_HYPERBOLA << "." << std::endl);
|
||||
m_score_type = CBLM_SCORE_TYPE_HYPERBOLA;
|
||||
}
|
||||
VERBOSE(2, "CacheBasedLanguageModel ScoreType: " << m_score_type << std::endl);
|
||||
};
|
||||
|
||||
void DynamicCacheBasedLanguageModel::SetMaxAge(unsigned int age)
|
||||
{
|
||||
#ifdef WITH_THREADS
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_cacheLock);
|
||||
#endif
|
||||
m_maxAge = age;
|
||||
VERBOSE(2, "CacheBasedLanguageModel MaxAge: " << m_maxAge << std::endl);
|
||||
};
|
||||
|
||||
float DynamicCacheBasedLanguageModel::decaying_score(const unsigned int age)
|
||||
{
|
||||
float sc;
|
||||
switch(m_score_type) {
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA:
|
||||
sc = (float) 1.0/age - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER:
|
||||
sc = (float) pow(age, -0.25) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL:
|
||||
sc = (age == 1) ? 0.0 : (float) exp( 1.0/age ) / exp(1.0) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_COSINE:
|
||||
sc = (float) cos( (age-1) * (PI/2) / m_maxAge ) - 1.0;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_HYPERBOLA_REWARD:
|
||||
sc = (float) 1.0/age;
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_POWER_REWARD:
|
||||
sc = (float) pow(age, -0.25);
|
||||
break;
|
||||
case CBLM_SCORE_TYPE_EXPONENTIAL_REWARD:
|
||||
sc = (age == 1) ? 1.0 : (float) exp( 1.0/age ) / exp(1.0);
|
||||
break;
|
||||
default:
|
||||
sc = -1.0;
|
||||
}
|
||||
return sc;
|
||||
}
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
size_t m_query_type; //way of querying the cache
|
||||
size_t m_score_type; //way of scoring entries of the cache
|
||||
std::string m_initfiles; // vector of files loaded in the initialization phase
|
||||
std::string m_name; // internal name to identify this instance of the Cache-based pseudo LM
|
||||
std::string m_name; // internal name to identify this instance of the Cache-based pseudo LM
|
||||
float m_lower_score; //lower_bound_score for no match
|
||||
std::vector<float> precomputedScores;
|
||||
unsigned int m_maxAge;
|
||||
@ -64,7 +64,7 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
void Update(std::vector<std::string> words, int age);
|
||||
|
||||
void ClearEntries(std::vector<std::string> entries);
|
||||
|
||||
|
||||
void Execute(std::vector<std::string> commands);
|
||||
void Execute_Single_Command(std::string command);
|
||||
|
||||
@ -80,24 +80,28 @@ class DynamicCacheBasedLanguageModel : public StatelessFeatureFunction
|
||||
|
||||
protected:
|
||||
static DynamicCacheBasedLanguageModel *s_instance;
|
||||
static std::map< const std::string, DynamicCacheBasedLanguageModel * > s_instance_map;
|
||||
static std::map< const std::string, DynamicCacheBasedLanguageModel * > s_instance_map;
|
||||
|
||||
public:
|
||||
DynamicCacheBasedLanguageModel(const std::string &line);
|
||||
~DynamicCacheBasedLanguageModel();
|
||||
|
||||
// Returns a copy of the internal name identifying this instance.
inline const std::string GetName()
{
  return m_name;
}
|
||||
// Replaces the internal name identifying this instance.
inline void SetName(const std::string name)
{
  m_name = name;
}
|
||||
|
||||
static const DynamicCacheBasedLanguageModel& Instance(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The DynamicCacheBasedLanguageModel feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static DynamicCacheBasedLanguageModel& InstanceNonConst(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The DynamicCacheBasedLanguageModel feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
inline const std::string GetName() {
|
||||
return m_name;
|
||||
};
|
||||
inline void SetName(const std::string name) {
|
||||
m_name = name;
|
||||
}
|
||||
|
||||
static const DynamicCacheBasedLanguageModel& Instance(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The DynamicCacheBasedLanguageModel feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static DynamicCacheBasedLanguageModel& InstanceNonConst(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The DynamicCacheBasedLanguageModel feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
static const DynamicCacheBasedLanguageModel& Instance() {
|
||||
return *s_instance;
|
||||
}
|
||||
@ -113,7 +117,7 @@ public:
|
||||
void Load(const std::string file);
|
||||
void Execute(std::string command);
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
void ExecuteDlt(std::map<std::string, std::string> dlt_meta);
|
||||
void ExecuteDlt(std::map<std::string, std::string> dlt_meta);
|
||||
|
||||
void ClearEntries(std::string &entries);
|
||||
void Insert(std::string &entries);
|
||||
|
@ -250,13 +250,13 @@ void FeatureRegistry::Construct(const std::string &name, const std::string &line
|
||||
|
||||
void FeatureRegistry::PrintFF() const
|
||||
{
|
||||
std::cerr << "Available feature functions:" << std::endl;
|
||||
Map::const_iterator iter;
|
||||
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
|
||||
const string &ffName = iter->first;
|
||||
std::cerr << ffName << " ";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
std::cerr << "Available feature functions:" << std::endl;
|
||||
Map::const_iterator iter;
|
||||
for (iter = registry_.begin(); iter != registry_.end(); ++iter) {
|
||||
const string &ffName = iter->first;
|
||||
std::cerr << ffName << " ";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -7,7 +7,7 @@ namespace Moses
|
||||
{
|
||||
|
||||
HyperParameterAsWeight::HyperParameterAsWeight(const std::string &line)
|
||||
:StatelessFeatureFunction(2, line)
|
||||
:StatelessFeatureFunction(2, line)
|
||||
{
|
||||
ReadParameters();
|
||||
|
||||
|
@ -14,8 +14,9 @@ class HyperParameterAsWeight : public StatelessFeatureFunction
|
||||
public:
|
||||
HyperParameterAsWeight(const std::string &line);
|
||||
|
||||
virtual bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
virtual bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
|
@ -18,14 +18,14 @@ InputFeature::InputFeature(const std::string &line)
|
||||
{
|
||||
m_numInputScores = this->m_numScoreComponents;
|
||||
ReadParameters();
|
||||
|
||||
|
||||
UTIL_THROW_IF2(s_instance, "Can only have 1 input feature");
|
||||
s_instance = this;
|
||||
}
|
||||
|
||||
void InputFeature::Load()
|
||||
{
|
||||
|
||||
|
||||
const PhraseDictionary *pt = PhraseDictionary::GetColl()[0];
|
||||
const PhraseDictionaryTreeAdaptor *ptBin = dynamic_cast<const PhraseDictionaryTreeAdaptor*>(pt);
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
namespace Moses
|
||||
{
|
||||
ReferenceComparison::ReferenceComparison(const std::string &line)
|
||||
:StatelessFeatureFunction(0, line)
|
||||
:StatelessFeatureFunction(0, line)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -10,34 +10,36 @@ namespace Moses
|
||||
class ReferenceComparison : public StatelessFeatureFunction
|
||||
{
|
||||
public:
|
||||
ReferenceComparison(const std::string &line);
|
||||
ReferenceComparison(const std::string &line);
|
||||
|
||||
virtual bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
virtual bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{}
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{}
|
||||
|
||||
virtual void Evaluate(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
virtual void Evaluate(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
|
||||
virtual void Evaluate(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
virtual void Evaluate(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
virtual void EvaluateChart(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
virtual void EvaluateChart(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
std::vector<float> DefaultWeights() const
|
||||
{ return std::vector<float>(); }
|
||||
std::vector<float> DefaultWeights() const {
|
||||
return std::vector<float>();
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
namespace Moses
|
||||
{
|
||||
RuleAmbiguity::RuleAmbiguity(const std::string &line)
|
||||
:StatelessFeatureFunction(1, line)
|
||||
,m_sourceSyntax(true)
|
||||
:StatelessFeatureFunction(1, line)
|
||||
,m_sourceSyntax(true)
|
||||
{
|
||||
}
|
||||
|
||||
@ -17,32 +17,31 @@ bool IsAmbiguous(const Word &word, bool sourceSyntax)
|
||||
}
|
||||
|
||||
void RuleAmbiguity::Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
// source can't be empty, right?
|
||||
float score = 0;
|
||||
|
||||
int count = 0;
|
||||
for (size_t i = 0; i < source.GetSize() - 0; ++i) {
|
||||
const Word &word = source.GetWord(i);
|
||||
bool ambiguous = IsAmbiguous(word, m_sourceSyntax);
|
||||
if (ambiguous) {
|
||||
++count;
|
||||
}
|
||||
else {
|
||||
if (count > 0) {
|
||||
score += count;
|
||||
}
|
||||
count = -1;
|
||||
}
|
||||
const Word &word = source.GetWord(i);
|
||||
bool ambiguous = IsAmbiguous(word, m_sourceSyntax);
|
||||
if (ambiguous) {
|
||||
++count;
|
||||
} else {
|
||||
if (count > 0) {
|
||||
score += count;
|
||||
}
|
||||
count = -1;
|
||||
}
|
||||
}
|
||||
|
||||
// 1st & last always adjacent to ambiguity
|
||||
++count;
|
||||
if (count > 0) {
|
||||
score += count;
|
||||
score += count;
|
||||
}
|
||||
|
||||
scoreBreakdown.PlusEquals(this, score);
|
||||
@ -51,7 +50,7 @@ void RuleAmbiguity::Evaluate(const Phrase &source
|
||||
void RuleAmbiguity::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "source-syntax") {
|
||||
m_sourceSyntax = Scan<bool>(value);
|
||||
m_sourceSyntax = Scan<bool>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
|
@ -9,32 +9,33 @@ namespace Moses
|
||||
class RuleAmbiguity : public StatelessFeatureFunction
|
||||
{
|
||||
public:
|
||||
RuleAmbiguity(const std::string &line);
|
||||
RuleAmbiguity(const std::string &line);
|
||||
|
||||
virtual bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
virtual bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
virtual void Evaluate(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
virtual void Evaluate(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
|
||||
virtual void Evaluate(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
virtual void Evaluate(const Hypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
virtual void EvaluateChart(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
virtual void EvaluateChart(const ChartHypothesis &hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
protected:
|
||||
bool m_sourceSyntax;
|
||||
|
@ -4,18 +4,18 @@
|
||||
namespace Moses
|
||||
{
|
||||
SetSourcePhrase::SetSourcePhrase(const std::string &line)
|
||||
:StatelessFeatureFunction(1, line)
|
||||
:StatelessFeatureFunction(1, line)
|
||||
{
|
||||
m_tuneable = false;
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
void SetSourcePhrase::Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
targetPhrase.SetRuleSource(source);
|
||||
targetPhrase.SetRuleSource(source);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -11,19 +11,20 @@ class SetSourcePhrase : public StatelessFeatureFunction
|
||||
public:
|
||||
SetSourcePhrase(const std::string &line);
|
||||
|
||||
virtual bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
virtual bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
virtual void Evaluate(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{}
|
||||
|
||||
virtual void Evaluate(const Hypothesis& hypo,
|
||||
@ -34,8 +35,9 @@ public:
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{}
|
||||
|
||||
std::vector<float> DefaultWeights() const
|
||||
{ return std::vector<float>(); }
|
||||
std::vector<float> DefaultWeights() const {
|
||||
return std::vector<float>();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
@ -24,8 +24,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
|
||||
m_tuneable = Scan<bool>(value);
|
||||
} else if (key == "filterable") { //ignore
|
||||
} else if (key == "path") {
|
||||
const std::string filePath = value;
|
||||
Load(filePath);
|
||||
const std::string filePath = value;
|
||||
Load(filePath);
|
||||
} else {
|
||||
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
|
||||
}
|
||||
@ -35,34 +35,34 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string
|
||||
bool SoftMatchingFeature::Load(const std::string& filePath)
|
||||
{
|
||||
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
|
||||
InputFileStream inStream(filePath);
|
||||
std::string line;
|
||||
while(getline(inStream, line)) {
|
||||
std::vector<std::string> tokens = Tokenize(line);
|
||||
UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");
|
||||
InputFileStream inStream(filePath);
|
||||
std::string line;
|
||||
while(getline(inStream, line)) {
|
||||
std::vector<std::string> tokens = Tokenize(line);
|
||||
UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");
|
||||
|
||||
// no soft matching necessary if LHS and RHS are the same
|
||||
if (tokens[0] == tokens[1]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Word LHS, RHS;
|
||||
LHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[0], true);
|
||||
RHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[1], true);
|
||||
|
||||
m_softMatches[RHS[0]->GetId()].push_back(LHS);
|
||||
GetOrSetFeatureName(RHS, LHS);
|
||||
// no soft matching necessary if LHS and RHS are the same
|
||||
if (tokens[0] == tokens[1]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
staticData.SetSoftMatches(m_softMatches);
|
||||
Word LHS, RHS;
|
||||
LHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[0], true);
|
||||
RHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[1], true);
|
||||
|
||||
return true;
|
||||
m_softMatches[RHS[0]->GetId()].push_back(LHS);
|
||||
GetOrSetFeatureName(RHS, LHS);
|
||||
}
|
||||
|
||||
staticData.SetSoftMatches(m_softMatches);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void SoftMatchingFeature::EvaluateChart(const ChartHypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
|
||||
const TargetPhrase& target = hypo.GetCurrTargetPhrase();
|
||||
@ -87,7 +87,8 @@ void SoftMatchingFeature::EvaluateChart(const ChartHypothesis& hypo,
|
||||
}
|
||||
|
||||
// when loading, or when we notice that non-terminals have been added after loading, we resize vectors
|
||||
void SoftMatchingFeature::ResizeCache() const {
|
||||
void SoftMatchingFeature::ResizeCache() const
|
||||
{
|
||||
FactorCollection& fc = FactorCollection::Instance();
|
||||
size_t numNonTerminals = fc.GetNumNonTerminals();
|
||||
|
||||
@ -98,7 +99,8 @@ void SoftMatchingFeature::ResizeCache() const {
|
||||
}
|
||||
|
||||
|
||||
const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, const Word& LHS) const {
|
||||
const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, const Word& LHS) const
|
||||
{
|
||||
try {
|
||||
#ifdef WITH_THREADS //try read-only lock
|
||||
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
|
||||
@ -107,23 +109,22 @@ const std::string& SoftMatchingFeature::GetOrSetFeatureName(const Word& RHS, con
|
||||
if (!name.empty()) {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
catch (const std::out_of_range& oor) {
|
||||
} catch (const std::out_of_range& oor) {
|
||||
#ifdef WITH_THREADS //need to resize cache; write lock
|
||||
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
|
||||
#endif
|
||||
ResizeCache();
|
||||
}
|
||||
#ifdef WITH_THREADS //need to update cache; write lock
|
||||
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
|
||||
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
|
||||
#endif
|
||||
std::string &name = m_nameCache[RHS[0]->GetId()][LHS[0]->GetId()];
|
||||
const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||||
std::string LHS_string = LHS.GetString(outputFactorOrder, false);
|
||||
std::string RHS_string = RHS.GetString(outputFactorOrder, false);
|
||||
name = LHS_string + "->" + RHS_string;
|
||||
return name;
|
||||
}
|
||||
std::string &name = m_nameCache[RHS[0]->GetId()][LHS[0]->GetId()];
|
||||
const std::vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||||
std::string LHS_string = LHS.GetString(outputFactorOrder, false);
|
||||
std::string RHS_string = RHS.GetString(outputFactorOrder, false);
|
||||
name = LHS_string + "->" + RHS_string;
|
||||
return name;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -13,230 +13,237 @@ namespace Moses
|
||||
{
|
||||
|
||||
InternalTree::InternalTree(const std::string & line, const bool terminal):
|
||||
m_value_nt(0),
|
||||
m_isTerminal(terminal)
|
||||
{
|
||||
m_value_nt(0),
|
||||
m_isTerminal(terminal)
|
||||
{
|
||||
|
||||
size_t found = line.find_first_of("[] ");
|
||||
size_t found = line.find_first_of("[] ");
|
||||
|
||||
if (found == line.npos) {
|
||||
m_value = line;
|
||||
}
|
||||
if (found == line.npos) {
|
||||
m_value = line;
|
||||
}
|
||||
|
||||
else {
|
||||
AddSubTree(line, 0);
|
||||
}
|
||||
else {
|
||||
AddSubTree(line, 0);
|
||||
}
|
||||
}
|
||||
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
|
||||
{
|
||||
|
||||
std::string value = "";
|
||||
char token = 0;
|
||||
std::string value = "";
|
||||
char token = 0;
|
||||
|
||||
while (token != ']' && pos != std::string::npos)
|
||||
{
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
while (token != ']' && pos != std::string::npos) {
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
TreePointer child(new InternalTree(value, false));
|
||||
m_children.push_back(child);
|
||||
pos = child->AddSubTree(line, pos+1);
|
||||
}
|
||||
else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
}
|
||||
else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && ! m_value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
TreePointer child(new InternalTree(value, true));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
TreePointer child(new InternalTree(value, false));
|
||||
m_children.push_back(child);
|
||||
pos = child->AddSubTree(line, pos+1);
|
||||
} else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
} else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && ! m_value.size() > 0) {
|
||||
m_value = value;
|
||||
} else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
TreePointer child(new InternalTree(value, true));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
}
|
||||
return min(line.size(),pos+1);
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
}
|
||||
return min(line.size(),pos+1);
|
||||
|
||||
}
|
||||
|
||||
std::string InternalTree::GetString() const {
|
||||
std::string InternalTree::GetString() const
|
||||
{
|
||||
|
||||
std::string ret = " ";
|
||||
std::string ret = " ";
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it)
|
||||
{
|
||||
ret += (*it)->GetString();
|
||||
}
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
ret += (*it)->GetString();
|
||||
}
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous) {
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous)
|
||||
{
|
||||
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
}
|
||||
else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
} else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const
|
||||
{
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void TreeStructureFeature::Load() {
|
||||
void TreeStructureFeature::Load()
|
||||
{
|
||||
|
||||
// syntactic constraints can be hooked in here.
|
||||
m_constraints = NULL;
|
||||
@ -248,27 +255,28 @@ void TreeStructureFeature::Load() {
|
||||
|
||||
|
||||
// define NT labels (ints) that are mapped from strings for quicker comparison.
|
||||
void TreeStructureFeature::AddNTLabels(TreePointer root) const {
|
||||
std::string label = root->GetLabel();
|
||||
void TreeStructureFeature::AddNTLabels(TreePointer root) const
|
||||
{
|
||||
std::string label = root->GetLabel();
|
||||
|
||||
if (root->IsTerminal()) {
|
||||
return;
|
||||
}
|
||||
if (root->IsTerminal()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
|
||||
if (it != m_labelset->string_to_label.end()) {
|
||||
root->SetNTLabel(it->second);
|
||||
}
|
||||
std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
|
||||
if (it != m_labelset->string_to_label.end()) {
|
||||
root->SetNTLabel(it->second);
|
||||
}
|
||||
|
||||
std::vector<TreePointer> children = root->GetChildren();
|
||||
for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
|
||||
AddNTLabels(*it2);
|
||||
}
|
||||
std::vector<TreePointer> children = root->GetChildren();
|
||||
for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
|
||||
AddNTLabels(*it2);
|
||||
}
|
||||
}
|
||||
|
||||
FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
|
||||
, int featureID /* used to index the state in the previous hypotheses */
|
||||
, ScoreComponentCollection* accumulator) const
|
||||
, int featureID /* used to index the state in the previous hypotheses */
|
||||
, ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
std::string tree;
|
||||
bool found = 0;
|
||||
@ -277,7 +285,7 @@ FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
|
||||
TreePointer mytree (new InternalTree(tree));
|
||||
|
||||
if (m_labelset) {
|
||||
AddNTLabels(mytree);
|
||||
AddNTLabels(mytree);
|
||||
}
|
||||
|
||||
//get subtrees (in target order)
|
||||
@ -304,8 +312,7 @@ FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
|
||||
accumulator->PlusEquals(this, *feature, 1);
|
||||
}
|
||||
return new TreeState(mytree);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
|
||||
}
|
||||
|
||||
|
@ -17,92 +17,91 @@ typedef int NTLabel;
|
||||
|
||||
class InternalTree
|
||||
{
|
||||
std::string m_value;
|
||||
NTLabel m_value_nt;
|
||||
std::vector<TreePointer> m_children;
|
||||
bool m_isTerminal;
|
||||
std::string m_value;
|
||||
NTLabel m_value_nt;
|
||||
std::vector<TreePointer> m_children;
|
||||
bool m_isTerminal;
|
||||
public:
|
||||
InternalTree(const std::string & line, const bool terminal = false);
|
||||
InternalTree(const InternalTree & tree):
|
||||
m_value(tree.m_value),
|
||||
m_isTerminal(tree.m_isTerminal) {
|
||||
const std::vector<TreePointer> & children = tree.m_children;
|
||||
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
|
||||
TreePointer child (new InternalTree(**it));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
}
|
||||
size_t AddSubTree(const std::string & line, size_t start);
|
||||
|
||||
std::string GetString() const;
|
||||
void Combine(const std::vector<TreePointer> &previous);
|
||||
const std::string & GetLabel() const {
|
||||
return m_value;
|
||||
InternalTree(const std::string & line, const bool terminal = false);
|
||||
InternalTree(const InternalTree & tree):
|
||||
m_value(tree.m_value),
|
||||
m_isTerminal(tree.m_isTerminal) {
|
||||
const std::vector<TreePointer> & children = tree.m_children;
|
||||
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
|
||||
TreePointer child (new InternalTree(**it));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
}
|
||||
size_t AddSubTree(const std::string & line, size_t start);
|
||||
|
||||
// optionally identify label by int instead of string;
|
||||
// allows abstraction if multiple nonterminal strings should map to same label.
|
||||
const NTLabel & GetNTLabel() const {
|
||||
return m_value_nt;
|
||||
}
|
||||
std::string GetString() const;
|
||||
void Combine(const std::vector<TreePointer> &previous);
|
||||
const std::string & GetLabel() const {
|
||||
return m_value;
|
||||
}
|
||||
|
||||
void SetNTLabel(NTLabel value) {
|
||||
m_value_nt = value;
|
||||
}
|
||||
// optionally identify label by int instead of string;
|
||||
// allows abstraction if multiple nonterminal strings should map to same label.
|
||||
const NTLabel & GetNTLabel() const {
|
||||
return m_value_nt;
|
||||
}
|
||||
|
||||
size_t GetLength() const {
|
||||
return m_children.size();
|
||||
}
|
||||
std::vector<TreePointer> & GetChildren() {
|
||||
return m_children;
|
||||
}
|
||||
void AddChild(TreePointer child) {
|
||||
m_children.push_back(child);
|
||||
}
|
||||
void SetNTLabel(NTLabel value) {
|
||||
m_value_nt = value;
|
||||
}
|
||||
|
||||
bool IsTerminal() const {
|
||||
return m_isTerminal;
|
||||
}
|
||||
size_t GetLength() const {
|
||||
return m_children.size();
|
||||
}
|
||||
std::vector<TreePointer> & GetChildren() {
|
||||
return m_children;
|
||||
}
|
||||
void AddChild(TreePointer child) {
|
||||
m_children.push_back(child);
|
||||
}
|
||||
|
||||
bool IsLeafNT() const {
|
||||
return (!m_isTerminal && m_children.size() == 0);
|
||||
}
|
||||
bool IsTerminal() const {
|
||||
return m_isTerminal;
|
||||
}
|
||||
|
||||
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
|
||||
// can be used for formulating syntax constraints.
|
||||
bool IsLeafNT() const {
|
||||
return (!m_isTerminal && m_children.size() == 0);
|
||||
}
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
|
||||
// can be used for formulating syntax constraints.
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
|
||||
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
|
||||
// pass vector of possible labels to search
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
// pass vector of possible labels to search
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
|
||||
};
|
||||
|
||||
// mapping from string nonterminal label to int representation.
|
||||
// allows abstraction if multiple nonterminal strings should map to same label.
|
||||
struct LabelSet
|
||||
{
|
||||
struct LabelSet {
|
||||
public:
|
||||
std::map<std::string, NTLabel> string_to_label;
|
||||
std::map<std::string, NTLabel> string_to_label;
|
||||
};
|
||||
|
||||
|
||||
@ -111,8 +110,8 @@ public:
|
||||
class SyntaxConstraints
|
||||
{
|
||||
public:
|
||||
virtual std::vector<std::string> SyntacticRules(TreePointer root, const std::vector<TreePointer> &previous) = 0;
|
||||
virtual ~SyntaxConstraints() {};
|
||||
virtual std::vector<std::string> SyntacticRules(TreePointer root, const std::vector<TreePointer> &previous) = 0;
|
||||
virtual ~SyntaxConstraints() {};
|
||||
};
|
||||
|
||||
|
||||
@ -125,10 +124,12 @@ public:
|
||||
{}
|
||||
|
||||
TreePointer GetTree() const {
|
||||
return m_tree;
|
||||
return m_tree;
|
||||
}
|
||||
|
||||
int Compare(const FFState& other) const {return 0;};
|
||||
int Compare(const FFState& other) const {
|
||||
return 0;
|
||||
};
|
||||
};
|
||||
|
||||
class TreeStructureFeature : public StatefulFeatureFunction
|
||||
@ -138,9 +139,11 @@ class TreeStructureFeature : public StatefulFeatureFunction
|
||||
public:
|
||||
TreeStructureFeature(const std::string &line)
|
||||
:StatefulFeatureFunction(0, line) {
|
||||
ReadParameters();
|
||||
}
|
||||
~TreeStructureFeature() {delete m_constraints;};
|
||||
ReadParameters();
|
||||
}
|
||||
~TreeStructureFeature() {
|
||||
delete m_constraints;
|
||||
};
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
||||
return new TreeState(TreePointer());
|
||||
@ -164,7 +167,9 @@ public:
|
||||
FFState* Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const {UTIL_THROW(util::Exception, "Not implemented");};
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
UTIL_THROW(util::Exception, "Not implemented");
|
||||
};
|
||||
FFState* EvaluateChart(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
@ -174,42 +179,42 @@ public:
|
||||
};
|
||||
|
||||
// Python-like generator that yields next nonterminal leaf on every call
|
||||
$generator(leafNT) {
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNT(InternalTree* root = 0): tree(root) {}
|
||||
$emit(std::vector<TreePointer>::iterator)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(it);
|
||||
}
|
||||
else if ((*it)->GetLength() > 0) {
|
||||
if (&(**it)) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = &(**it));
|
||||
}
|
||||
}
|
||||
$generator(leafNT)
|
||||
{
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNT(InternalTree* root = 0): tree(root) {}
|
||||
$emit(std::vector<TreePointer>::iterator)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(it);
|
||||
} else if ((*it)->GetLength() > 0) {
|
||||
if (&(**it)) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = &(**it));
|
||||
}
|
||||
}
|
||||
$stop;
|
||||
}
|
||||
$stop;
|
||||
};
|
||||
|
||||
|
||||
// Python-like generator that yields the parent of the next nonterminal leaf on every call
|
||||
$generator(leafNTParent) {
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNTParent(InternalTree* root = 0): tree(root) {}
|
||||
$emit(InternalTree*)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(tree);
|
||||
}
|
||||
else if ((*it)->GetLength() > 0) {
|
||||
if (&(**it)) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = &(**it));
|
||||
}
|
||||
}
|
||||
$generator(leafNTParent)
|
||||
{
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNTParent(InternalTree* root = 0): tree(root) {}
|
||||
$emit(InternalTree*)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(tree);
|
||||
} else if ((*it)->GetLength() > 0) {
|
||||
if (&(**it)) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = &(**it));
|
||||
}
|
||||
}
|
||||
$stop;
|
||||
}
|
||||
$stop;
|
||||
};
|
||||
|
||||
|
||||
|
@ -59,8 +59,7 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString, bool
|
||||
if (isNonTerminal) {
|
||||
m_factorIdNonTerminal++;
|
||||
UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals, "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
m_factorId++;
|
||||
}
|
||||
}
|
||||
|
@ -182,9 +182,9 @@ template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std:
|
||||
// for pruning
|
||||
template <class Model> float Fill<Model>::GetBestScore(const ChartCellLabel *chartCell) const
|
||||
{
|
||||
search::PartialVertex vertex = chartCell->GetStack().incr->RootAlternate();
|
||||
UTIL_THROW_IF2(vertex.Empty(), "hypothesis with empty stack");
|
||||
return vertex.Bound();
|
||||
search::PartialVertex vertex = chartCell->GetStack().incr->RootAlternate();
|
||||
UTIL_THROW_IF2(vertex.Empty(), "hypothesis with empty stack");
|
||||
return vertex.Bound();
|
||||
}
|
||||
|
||||
// TODO: factors (but chart doesn't seem to support factors anyway).
|
||||
|
@ -33,14 +33,14 @@ InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms,
|
||||
|
||||
InputPath::~InputPath()
|
||||
{
|
||||
// Since there is no way for the Phrase Dictionaries to tell in
|
||||
// which (sentence) context phrases were looked up, we tell them
|
||||
// Since there is no way for the Phrase Dictionaries to tell in
|
||||
// which (sentence) context phrases were looked up, we tell them
|
||||
// now that the phrase isn't needed any more by this inputPath
|
||||
typedef std::pair<const TargetPhraseCollection*, const void* > entry;
|
||||
std::map<const PhraseDictionary*, entry>::const_iterator iter;
|
||||
for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter)
|
||||
iter->first->Release(iter->second.first);
|
||||
|
||||
|
||||
delete m_inputScore;
|
||||
}
|
||||
|
||||
|
@ -59,22 +59,22 @@ public:
|
||||
delete state;
|
||||
}
|
||||
|
||||
void reset(const DALMState &from){
|
||||
delete state;
|
||||
state = new DALM::State(*from.state);
|
||||
}
|
||||
void reset(const DALMState &from) {
|
||||
delete state;
|
||||
state = new DALM::State(*from.state);
|
||||
}
|
||||
|
||||
void reset(DALM::State *s){
|
||||
delete state;
|
||||
state = s;
|
||||
}
|
||||
void reset(DALM::State *s) {
|
||||
delete state;
|
||||
state = s;
|
||||
}
|
||||
|
||||
virtual int Compare(const FFState& other) const{
|
||||
const DALMState &o = static_cast<const DALMState &>(other);
|
||||
if(state->get_count() < o.state->get_count()) return -1;
|
||||
else if(state->get_count() > o.state->get_count()) return 1;
|
||||
else return state->compare(o.state);
|
||||
}
|
||||
virtual int Compare(const FFState& other) const {
|
||||
const DALMState &o = static_cast<const DALMState &>(other);
|
||||
if(state->get_count() < o.state->get_count()) return -1;
|
||||
else if(state->get_count() > o.state->get_count()) return 1;
|
||||
else return state->compare(o.state);
|
||||
}
|
||||
|
||||
DALM::State *get_state() const {
|
||||
return state;
|
||||
@ -88,78 +88,78 @@ public:
|
||||
class DALMChartState : public FFState
|
||||
{
|
||||
private:
|
||||
const ChartHypothesis &hypo;
|
||||
DALM::Fragment *prefixFragments;
|
||||
unsigned short prefixLength;
|
||||
float prefixScore;
|
||||
DALMState *rightContext;
|
||||
bool isLarge;
|
||||
const ChartHypothesis &hypo;
|
||||
DALM::Fragment *prefixFragments;
|
||||
unsigned short prefixLength;
|
||||
float prefixScore;
|
||||
DALMState *rightContext;
|
||||
bool isLarge;
|
||||
|
||||
public:
|
||||
DALMChartState(
|
||||
const ChartHypothesis &hypo,
|
||||
DALM::Fragment *prefixFragments,
|
||||
unsigned short prefixLength,
|
||||
float prefixScore,
|
||||
DALMState *rightContext,
|
||||
bool isLarge)
|
||||
: hypo(hypo),
|
||||
prefixFragments(prefixFragments),
|
||||
prefixLength(prefixLength),
|
||||
prefixScore(prefixScore),
|
||||
rightContext(rightContext),
|
||||
isLarge(isLarge)
|
||||
{}
|
||||
DALMChartState(
|
||||
const ChartHypothesis &hypo,
|
||||
DALM::Fragment *prefixFragments,
|
||||
unsigned short prefixLength,
|
||||
float prefixScore,
|
||||
DALMState *rightContext,
|
||||
bool isLarge)
|
||||
: hypo(hypo),
|
||||
prefixFragments(prefixFragments),
|
||||
prefixLength(prefixLength),
|
||||
prefixScore(prefixScore),
|
||||
rightContext(rightContext),
|
||||
isLarge(isLarge)
|
||||
{}
|
||||
|
||||
virtual ~DALMChartState(){
|
||||
delete [] prefixFragments;
|
||||
delete rightContext;
|
||||
}
|
||||
virtual ~DALMChartState() {
|
||||
delete [] prefixFragments;
|
||||
delete rightContext;
|
||||
}
|
||||
|
||||
unsigned short GetPrefixLength() const{
|
||||
return prefixLength;
|
||||
}
|
||||
unsigned short GetPrefixLength() const {
|
||||
return prefixLength;
|
||||
}
|
||||
|
||||
const DALM::Fragment *GetPrefixFragments() const{
|
||||
return prefixFragments;
|
||||
}
|
||||
const DALM::Fragment *GetPrefixFragments() const {
|
||||
return prefixFragments;
|
||||
}
|
||||
|
||||
float GetPrefixScore() const{
|
||||
return prefixScore;
|
||||
}
|
||||
float GetPrefixScore() const {
|
||||
return prefixScore;
|
||||
}
|
||||
|
||||
const DALMState *GetRightContext() const{
|
||||
return rightContext;
|
||||
}
|
||||
const DALMState *GetRightContext() const {
|
||||
return rightContext;
|
||||
}
|
||||
|
||||
bool LargeEnough() const{
|
||||
return isLarge;
|
||||
}
|
||||
bool LargeEnough() const {
|
||||
return isLarge;
|
||||
}
|
||||
|
||||
virtual int Compare(const FFState& other) const{
|
||||
const DALMChartState &o = static_cast<const DALMChartState &>(other);
|
||||
// prefix
|
||||
virtual int Compare(const FFState& other) const {
|
||||
const DALMChartState &o = static_cast<const DALMChartState &>(other);
|
||||
// prefix
|
||||
if (hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
|
||||
if (prefixLength != o.prefixLength){
|
||||
return (prefixLength < o.prefixLength)?-1:1;
|
||||
} else {
|
||||
if(prefixLength > 0){
|
||||
DALM::Fragment &f = prefixFragments[prefixLength-1];
|
||||
DALM::Fragment &of = o.prefixFragments[prefixLength-1];
|
||||
int ret = DALM::compare_fragments(f, of);
|
||||
if(ret != 0) return ret;
|
||||
}
|
||||
}
|
||||
if (prefixLength != o.prefixLength) {
|
||||
return (prefixLength < o.prefixLength)?-1:1;
|
||||
} else {
|
||||
if(prefixLength > 0) {
|
||||
DALM::Fragment &f = prefixFragments[prefixLength-1];
|
||||
DALM::Fragment &of = o.prefixFragments[prefixLength-1];
|
||||
int ret = DALM::compare_fragments(f, of);
|
||||
if(ret != 0) return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// suffix
|
||||
size_t inputSize = hypo.GetManager().GetSource().GetSize();
|
||||
size_t inputSize = hypo.GetManager().GetSource().GetSize();
|
||||
if (hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
|
||||
int ret = o.rightContext->Compare(*rightContext);
|
||||
int ret = o.rightContext->Compare(*rightContext);
|
||||
if (ret != 0) return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
LanguageModelDALM::LanguageModelDALM(const std::string &line)
|
||||
@ -181,18 +181,18 @@ LanguageModelDALM::~LanguageModelDALM()
|
||||
|
||||
void LanguageModelDALM::Load()
|
||||
{
|
||||
/////////////////////
|
||||
// READING INIFILE //
|
||||
/////////////////////
|
||||
string inifile= m_filePath + "/dalm.ini";
|
||||
/////////////////////
|
||||
// READING INIFILE //
|
||||
/////////////////////
|
||||
string inifile= m_filePath + "/dalm.ini";
|
||||
|
||||
UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
|
||||
util::FileOpenException,
|
||||
"Failed to read DALM ini file " << m_filePath << ". Probably doesn't exist");
|
||||
|
||||
model = m_filePath + "/" + model;
|
||||
words = m_filePath + "/" + words;
|
||||
wordstxt = m_filePath + "/" + wordstxt;
|
||||
model = m_filePath + "/" + model;
|
||||
words = m_filePath + "/" + words;
|
||||
wordstxt = m_filePath + "/" + wordstxt;
|
||||
|
||||
// Preparing a logger object.
|
||||
m_logger = new DALM::Logger(stderr);
|
||||
@ -233,14 +233,14 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
|
||||
size_t currPos = 0;
|
||||
size_t hist_count = 0;
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor) {
|
||||
m_lm->init_state(*state);
|
||||
currPos++;
|
||||
hist_count++;
|
||||
}
|
||||
|
||||
if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor){
|
||||
m_lm->init_state(*state);
|
||||
currPos++;
|
||||
hist_count++;
|
||||
}
|
||||
|
||||
while (currPos < phraseSize) {
|
||||
const Word &word = phrase.GetWord(currPos);
|
||||
hist_count++;
|
||||
@ -249,9 +249,9 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
|
||||
state->refresh();
|
||||
hist_count = 0;
|
||||
} else {
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
float score = m_lm->query(wid, *state);
|
||||
fullScore += score;
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
float score = m_lm->query(wid, *state);
|
||||
fullScore += score;
|
||||
if (hist_count >= m_nGramOrder) ngramScore += score;
|
||||
if (wid==m_vocab->unk()) ++oovCount;
|
||||
}
|
||||
@ -259,9 +259,9 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
|
||||
currPos++;
|
||||
}
|
||||
|
||||
fullScore = TransformLMScore(fullScore);
|
||||
ngramScore = TransformLMScore(ngramScore);
|
||||
delete dalm_state;
|
||||
fullScore = TransformLMScore(fullScore);
|
||||
ngramScore = TransformLMScore(ngramScore);
|
||||
delete dalm_state;
|
||||
}
|
||||
|
||||
FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
|
||||
@ -283,11 +283,11 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
const std::size_t adjust_end = std::min(end, begin + m_nGramOrder - 1);
|
||||
|
||||
DALMState *dalm_state = new DALMState(*dalm_ps);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
float score = 0.0;
|
||||
for(std::size_t position=begin; position < adjust_end; position++){
|
||||
score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), *state);
|
||||
for(std::size_t position=begin; position < adjust_end; position++) {
|
||||
score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), *state);
|
||||
}
|
||||
|
||||
if (hypo.IsSourceCompleted()) {
|
||||
@ -295,8 +295,8 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
|
||||
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
|
||||
|
||||
score += m_lm->query(wid_end, *state);
|
||||
|
||||
score += m_lm->query(wid_end, *state);
|
||||
} else if (adjust_end < end) {
|
||||
// Get state after adding a long phrase.
|
||||
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
|
||||
@ -304,7 +304,7 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
|
||||
}
|
||||
|
||||
score = TransformLMScore(score);
|
||||
score = TransformLMScore(score);
|
||||
if (OOVFeatureEnabled()) {
|
||||
std::vector<float> scores(2);
|
||||
scores[0] = score;
|
||||
@ -317,73 +317,74 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
return dalm_state;
|
||||
}
|
||||
|
||||
FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const{
|
||||
FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const
|
||||
{
|
||||
// initialize language model context state
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
size_t contextSize = m_nGramOrder-1;
|
||||
DALM::Fragment *prefixFragments = new DALM::Fragment[contextSize];
|
||||
unsigned short prefixLength = 0;
|
||||
bool isLarge = false;
|
||||
size_t contextSize = m_nGramOrder-1;
|
||||
DALM::Fragment *prefixFragments = new DALM::Fragment[contextSize];
|
||||
unsigned short prefixLength = 0;
|
||||
bool isLarge = false;
|
||||
|
||||
// initial language model scores
|
||||
float prefixScore = 0.0; // not yet final for initial words (lack context)
|
||||
float hypoScore = 0.0; // total hypothesis score.
|
||||
|
||||
const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
|
||||
size_t hypoSize = targetPhrase.GetSize();
|
||||
const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
|
||||
size_t hypoSize = targetPhrase.GetSize();
|
||||
|
||||
// get index map for underlying hypotheses
|
||||
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
||||
targetPhrase.GetAlignNonTerm().GetNonTermIndexMap();
|
||||
|
||||
size_t phrasePos = 0;
|
||||
|
||||
// beginning of sentence.
|
||||
if(hypoSize > 0){
|
||||
const Word &word = targetPhrase.GetWord(0);
|
||||
if(!word.IsNonTerminal()){
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if(word.GetFactor(m_factorType) == m_beginSentenceFactor){
|
||||
m_lm->init_state(*state);
|
||||
// state is finalized.
|
||||
isLarge = true;
|
||||
}else{
|
||||
if(isLarge){
|
||||
float score = m_lm->query(wid, *state);
|
||||
hypoScore += score;
|
||||
}else{
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
size_t phrasePos = 0;
|
||||
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
// beginning of sentence.
|
||||
if(hypoSize > 0) {
|
||||
const Word &word = targetPhrase.GetWord(0);
|
||||
if(!word.IsNonTerminal()) {
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if(word.GetFactor(m_factorType) == m_beginSentenceFactor) {
|
||||
m_lm->init_state(*state);
|
||||
// state is finalized.
|
||||
isLarge = true;
|
||||
} else {
|
||||
if(isLarge) {
|
||||
float score = m_lm->query(wid, *state);
|
||||
hypoScore += score;
|
||||
} else {
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// special case: rule starts with non-terminal -> copy everything
|
||||
size_t nonTermIndex = nonTermIndexMap[0];
|
||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
||||
|
||||
const DALMChartState* prevState =
|
||||
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
|
||||
|
||||
|
||||
// get prefixScore and hypoScore
|
||||
prefixScore = prevState->GetPrefixScore();
|
||||
hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
|
||||
// get language model state
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
|
||||
prefixLength = prevState->GetPrefixLength();
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
std::memcpy(prefixFragments, prevPrefixFragments, sizeof(DALM::Fragment)*prefixLength);
|
||||
isLarge = prevState->LargeEnough();
|
||||
}
|
||||
phrasePos++;
|
||||
prefixLength = prevState->GetPrefixLength();
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
std::memcpy(prefixFragments, prevPrefixFragments, sizeof(DALM::Fragment)*prefixLength);
|
||||
isLarge = prevState->LargeEnough();
|
||||
}
|
||||
phrasePos++;
|
||||
}
|
||||
|
||||
// loop over rule
|
||||
@ -393,16 +394,16 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
|
||||
|
||||
// regular word
|
||||
if (!word.IsNonTerminal()) {
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(wid, *state);
|
||||
}else{
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(wid, *state);
|
||||
} else {
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
}
|
||||
|
||||
// non-terminal, add phrase from underlying hypothesis
|
||||
@ -414,40 +415,40 @@ FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featu
|
||||
|
||||
const DALMChartState* prevState =
|
||||
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
|
||||
|
||||
|
||||
size_t prevPrefixLength = prevState->GetPrefixLength();
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
DALM::Gap gap(*state);
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
DALM::Gap gap(*state);
|
||||
// score its prefix
|
||||
for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) {
|
||||
const DALM::Fragment &f = prevPrefixFragments[prefixPos];
|
||||
const DALM::Fragment &f = prevPrefixFragments[prefixPos];
|
||||
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(f, *state, gap);
|
||||
} else {
|
||||
float score = m_lm->query(f, *state, gap, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
gap.succ();
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(f, *state, gap);
|
||||
} else {
|
||||
float score = m_lm->query(f, *state, gap, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
gap.succ();
|
||||
}
|
||||
|
||||
// check if we are dealing with a large sub-phrase
|
||||
if (prevState->LargeEnough()) {
|
||||
// add its language model score
|
||||
hypoScore += UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
hypoScore += UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
hypoScore -= prevState->GetPrefixScore(); // remove overwrapped score.
|
||||
// copy language model state
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
// copy language model state
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
} else {
|
||||
DALM::State *state_new = new DALM::State(*prevState->GetRightContext()->get_state());
|
||||
m_lm->set_state(*state_new, *state, gap);
|
||||
dalm_state->reset(state_new);
|
||||
state = dalm_state->get_state();
|
||||
}
|
||||
DALM::State *state_new = new DALM::State(*prevState->GetRightContext()->get_state());
|
||||
m_lm->set_state(*state_new, *state, gap);
|
||||
dalm_state->reset(state_new);
|
||||
state = dalm_state->get_state();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -466,36 +467,36 @@ void LanguageModelDALM::CreateVocabMapping(const std::string &wordstxt)
|
||||
{
|
||||
InputFileStream vocabStrm(wordstxt);
|
||||
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
|
||||
string line;
|
||||
std::size_t max_fid = 0;
|
||||
std::size_t max_fid = 0;
|
||||
while(getline(vocabStrm, line)) {
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(line);
|
||||
std::size_t fid = factor->GetId();
|
||||
DALM::VocabId wid = m_vocab->lookup(line.c_str());
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(line);
|
||||
std::size_t fid = factor->GetId();
|
||||
DALM::VocabId wid = m_vocab->lookup(line.c_str());
|
||||
|
||||
vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
|
||||
if(max_fid < fid) max_fid = fid;
|
||||
vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
|
||||
if(max_fid < fid) max_fid = fid;
|
||||
}
|
||||
|
||||
for(std::size_t i = 0; i < m_vocabMap.size(); i++){
|
||||
m_vocabMap[i] = m_vocab->unk();
|
||||
}
|
||||
for(std::size_t i = 0; i < m_vocabMap.size(); i++) {
|
||||
m_vocabMap[i] = m_vocab->unk();
|
||||
}
|
||||
|
||||
m_vocabMap.resize(max_fid+1, m_vocab->unk());
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
|
||||
while(it != vlist.end()){
|
||||
std::pair<std::size_t, DALM::VocabId> &entry = *it;
|
||||
m_vocabMap[entry.first] = entry.second;
|
||||
m_vocabMap.resize(max_fid+1, m_vocab->unk());
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
|
||||
while(it != vlist.end()) {
|
||||
std::pair<std::size_t, DALM::VocabId> &entry = *it;
|
||||
m_vocabMap[entry.first] = entry.second;
|
||||
|
||||
++it;
|
||||
}
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// Translate a Moses factor into a DALM vocabulary id. Factor ids that lie
// outside the mapping table resolve to the vocabulary's unknown-word id.
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
  const std::size_t factorId = factor->GetId();
  if (factorId < m_vocabMap.size()) {
    return m_vocabMap[factorId];
  }
  return m_vocab->unk();
}
|
||||
|
||||
void LanguageModelDALM::SetParameter(const std::string& key, const std::string& value)
|
||||
|
@ -182,11 +182,11 @@ void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hyp
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
void
|
||||
Manager::
|
||||
printThisHypothesis(long translationId, const Hypothesis* hypo,
|
||||
const vector <const TargetPhrase*> & remainingPhrases,
|
||||
float remainingScore, ostream& outputStream) const
|
||||
printThisHypothesis(long translationId, const Hypothesis* hypo,
|
||||
const vector <const TargetPhrase*> & remainingPhrases,
|
||||
float remainingScore, ostream& outputStream) const
|
||||
{
|
||||
|
||||
outputStream << translationId << " ||| ";
|
||||
|
@ -140,23 +140,23 @@ public:
|
||||
std::pair<MapSrc2Tgt::iterator,bool> piter;
|
||||
if(useCache) {
|
||||
piter=m_cache.insert(std::make_pair(src,static_cast<TargetPhraseCollectionWithSourcePhrase const*>(0)));
|
||||
if(!piter.second){
|
||||
if (piter.first->second){
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << (piter.first->second)->GetSize() << std::endl);
|
||||
}else{
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << 0 << std::endl);
|
||||
}
|
||||
if(!piter.second) {
|
||||
if (piter.first->second) {
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << (piter.first->second)->GetSize() << std::endl);
|
||||
} else {
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: piter.first->second->GetSize():" << 0 << std::endl);
|
||||
}
|
||||
return piter.first->second;
|
||||
}
|
||||
} else if (m_cache.size()) {
|
||||
MapSrc2Tgt::const_iterator i=m_cache.find(src);
|
||||
if (i!=m_cache.end()){
|
||||
if (i->second){
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << (void*) (i->second) << std::endl);
|
||||
}else{
|
||||
if (i!=m_cache.end()) {
|
||||
if (i->second) {
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << (void*) (i->second) << std::endl);
|
||||
} else {
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << 0 << std::endl);
|
||||
}
|
||||
}else{
|
||||
} else {
|
||||
VERBOSE(1,"PDTAimp::GetTargetPhraseCollection: i->second->GetSize():" << 0 << std::endl);
|
||||
}
|
||||
return (i!=m_cache.end() ? i->second : 0);
|
||||
|
@ -117,7 +117,7 @@ public:
|
||||
|
||||
std::vector<float> GetWeights(const std::string &name);
|
||||
std::map<std::string, std::vector<float> > GetAllWeights() const {
|
||||
return m_weights;
|
||||
return m_weights;
|
||||
}
|
||||
std::set<std::string> GetWeightNames() const;
|
||||
|
||||
|
@ -381,7 +381,7 @@ void Phrase::InitStartEndWord()
|
||||
size_t Phrase::Find(const Phrase &sought, int maxUnknown) const
|
||||
{
|
||||
if (GetSize() < sought.GetSize()) {
|
||||
// sought phrase too big
|
||||
// sought phrase too big
|
||||
return NOT_FOUND;
|
||||
}
|
||||
|
||||
|
@ -65,7 +65,8 @@ void Candidates::readBin(FILE* f)
|
||||
const LabelId PrefixTreeMap::MagicWord = std::numeric_limits<LabelId>::max() - 1;
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
PrefixTreeMap::~PrefixTreeMap() {
|
||||
PrefixTreeMap::~PrefixTreeMap()
|
||||
{
|
||||
if(m_FileSrc) {
|
||||
fClose(m_FileSrc);
|
||||
}
|
||||
@ -99,8 +100,7 @@ WordVoc &ReadVoc(std::map<std::string,WordVoc> &vocs, const std::string& filenam
|
||||
WordVoc &voc = vocs[filename];
|
||||
voc.Read(filename);
|
||||
return voc;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return vi->second;
|
||||
}
|
||||
}
|
||||
|
@ -251,8 +251,8 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
|
||||
}
|
||||
|
||||
if (StaticData::Instance().AdjacentOnly() &&
|
||||
!hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
|
||||
return false;
|
||||
!hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool leftMostEdge = (hypoFirstGapPos == startPos);
|
||||
|
@ -254,8 +254,8 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
|
||||
}
|
||||
|
||||
if (StaticData::Instance().AdjacentOnly() &&
|
||||
!hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
|
||||
return;
|
||||
!hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// loop through all translation options
|
||||
|
@ -130,30 +130,30 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
|
||||
|
||||
std::vector< std::map<std::string, std::string> > dlt_meta = ProcessAndStripDLT(line);
|
||||
|
||||
PhraseDictionaryDynamicCacheBased* cbtm = NULL;
|
||||
DynamicCacheBasedLanguageModel* cblm = NULL;
|
||||
PhraseDictionaryDynamicCacheBased* cbtm = NULL;
|
||||
DynamicCacheBasedLanguageModel* cblm = NULL;
|
||||
std::vector< std::map<std::string, std::string> >::iterator dlt_meta_it = dlt_meta.begin();
|
||||
for (dlt_meta_it = dlt_meta.begin(); dlt_meta_it != dlt_meta.end(); ++dlt_meta_it) {
|
||||
|
||||
if ((*dlt_meta_it).find("type") != (*dlt_meta_it).end()) {
|
||||
if ((*dlt_meta_it)["type"] == "cbtm") {
|
||||
std::string id = "default";
|
||||
if ((*dlt_meta_it).find("id") != (*dlt_meta_it).end()) {
|
||||
id = (*dlt_meta_it)["id"];
|
||||
}
|
||||
cbtm = &PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
|
||||
if (cbtm) cbtm->ExecuteDlt(*dlt_meta_it);
|
||||
}
|
||||
if ((*dlt_meta_it)["type"] == "cblm") {
|
||||
std::string id = "default";
|
||||
if ((*dlt_meta_it).find("id") != (*dlt_meta_it).end()) {
|
||||
id = (*dlt_meta_it)["id"];
|
||||
}
|
||||
cblm = &DynamicCacheBasedLanguageModel::InstanceNonConst(id);
|
||||
if (cblm) cblm->ExecuteDlt(*dlt_meta_it);
|
||||
}
|
||||
|
||||
if ((*dlt_meta_it).find("type") != (*dlt_meta_it).end()) {
|
||||
if ((*dlt_meta_it)["type"] == "cbtm") {
|
||||
std::string id = "default";
|
||||
if ((*dlt_meta_it).find("id") != (*dlt_meta_it).end()) {
|
||||
id = (*dlt_meta_it)["id"];
|
||||
}
|
||||
cbtm = &PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
|
||||
if (cbtm) cbtm->ExecuteDlt(*dlt_meta_it);
|
||||
}
|
||||
if ((*dlt_meta_it)["type"] == "cblm") {
|
||||
std::string id = "default";
|
||||
if ((*dlt_meta_it).find("id") != (*dlt_meta_it).end()) {
|
||||
id = (*dlt_meta_it)["id"];
|
||||
}
|
||||
cblm = &DynamicCacheBasedLanguageModel::InstanceNonConst(id);
|
||||
if (cblm) cblm->ExecuteDlt(*dlt_meta_it);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parse XML markup in translation line
|
||||
std::vector< size_t > xmlWalls;
|
||||
|
@ -537,21 +537,21 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
NoCache();
|
||||
OverrideFeatures();
|
||||
|
||||
std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
|
||||
std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
|
||||
|
||||
/*
|
||||
std::cerr <<"Before ShowWeights" << std::endl;
|
||||
// setting "-show-weights" -> just dump out weights and exit
|
||||
if (m_parameter->isParamSpecified("show-weights")) {
|
||||
MosesCmd::ShowWeights();
|
||||
exit(0);
|
||||
}
|
||||
std::cerr <<"After ShowWeights" << std::endl;
|
||||
*/
|
||||
/*
|
||||
std::cerr <<"Before ShowWeights" << std::endl;
|
||||
// setting "-show-weights" -> just dump out weights and exit
|
||||
if (m_parameter->isParamSpecified("show-weights")) {
|
||||
MosesCmd::ShowWeights();
|
||||
exit(0);
|
||||
}
|
||||
std::cerr <<"After ShowWeights" << std::endl;
|
||||
*/
|
||||
|
||||
std::cerr <<"Before LoadFeatureFunctions" << std::endl;
|
||||
std::cerr <<"Before LoadFeatureFunctions" << std::endl;
|
||||
LoadFeatureFunctions();
|
||||
std::cerr <<"After LoadFeatureFunctions" << std::endl;
|
||||
std::cerr <<"After LoadFeatureFunctions" << std::endl;
|
||||
|
||||
if (!LoadDecodeGraphs()) return false;
|
||||
|
||||
@ -982,8 +982,7 @@ bool StaticData::CheckWeights() const
|
||||
cerr << fname << "\n";
|
||||
if (featureNames.find(fname) != featureNames.end()) {
|
||||
weightNames.erase(iter++);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
@ -1002,7 +1001,8 @@ bool StaticData::CheckWeights() const
|
||||
}
|
||||
|
||||
|
||||
void StaticData::LoadSparseWeightsFromConfig() {
|
||||
void StaticData::LoadSparseWeightsFromConfig()
|
||||
{
|
||||
set<string> featureNames;
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
for (size_t i = 0; i < ffs.size(); ++i) {
|
||||
@ -1017,7 +1017,7 @@ void StaticData::LoadSparseWeightsFromConfig() {
|
||||
// this indicates that it is sparse feature
|
||||
if (featureNames.find(iter->first) == featureNames.end()) {
|
||||
UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first);
|
||||
m_allWeights.Assign(iter->first, iter->second[0]);
|
||||
m_allWeights.Assign(iter->first, iter->second[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1211,24 +1211,24 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
|
||||
vector<float> weights;
|
||||
vector<string> toks = Tokenize(denseWeights);
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
const string &tok = toks[i];
|
||||
const string &tok = toks[i];
|
||||
|
||||
if (tok.substr(tok.size() - 1, 1) == "=") {
|
||||
// start of new feature
|
||||
if (tok.substr(tok.size() - 1, 1) == "=") {
|
||||
// start of new feature
|
||||
|
||||
if (name != "") {
|
||||
// save previous ff
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
|
||||
m_allWeights.Assign(&ff, weights);
|
||||
weights.clear();
|
||||
}
|
||||
if (name != "") {
|
||||
// save previous ff
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
|
||||
m_allWeights.Assign(&ff, weights);
|
||||
weights.clear();
|
||||
}
|
||||
|
||||
name = tok.substr(0, tok.size() - 1);
|
||||
} else {
|
||||
// a weight for curr ff
|
||||
float weight = Scan<float>(toks[i]);
|
||||
weights.push_back(weight);
|
||||
}
|
||||
name = tok.substr(0, tok.size() - 1);
|
||||
} else {
|
||||
// a weight for curr ff
|
||||
float weight = Scan<float>(toks[i]);
|
||||
weights.push_back(weight);
|
||||
}
|
||||
}
|
||||
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
|
||||
@ -1238,14 +1238,14 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
|
||||
InputFileStream sparseStrme(sparseFile);
|
||||
string line;
|
||||
while (getline(sparseStrme, line)) {
|
||||
vector<string> toks = Tokenize(line);
|
||||
UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight");
|
||||
vector<string> toks = Tokenize(line);
|
||||
UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight");
|
||||
|
||||
vector<string> names = Tokenize(toks[0], "_");
|
||||
UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. Should be FFName_spareseName");
|
||||
vector<string> names = Tokenize(toks[0], "_");
|
||||
UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. Should be FFName_spareseName");
|
||||
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]);
|
||||
m_allWeights.Assign(&ff, names[1], Scan<float>(toks[1]));
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]);
|
||||
m_allWeights.Assign(&ff, names[1], Scan<float>(toks[1]));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -760,8 +760,9 @@ public:
|
||||
}
|
||||
|
||||
|
||||
bool AdjacentOnly() const
|
||||
{ return m_adjacentOnly; }
|
||||
bool AdjacentOnly() const {
|
||||
return m_adjacentOnly;
|
||||
}
|
||||
|
||||
|
||||
void ResetWeights(const std::string &denseWeights, const std::string &sparseFile);
|
||||
@ -769,11 +770,11 @@ public:
|
||||
|
||||
// need global access for output of tree structure
|
||||
// Returns the stored tree-structure feature function pointer (not owned by
// the caller).
const StatefulFeatureFunction* GetTreeStructure() const {
  return m_treeStructure;
}
|
||||
|
||||
// Stores the given tree-structure feature function pointer; only the pointer
// is kept, no copy of the object is made.
void SetTreeStructure(const StatefulFeatureFunction* treeStructure) {
  m_treeStructure = treeStructure;
}
|
||||
|
||||
};
|
||||
|
@ -3,17 +3,17 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
@ -38,7 +38,7 @@ using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
TargetPhrase::TargetPhrase( std::string out_string)
|
||||
TargetPhrase::TargetPhrase( std::string out_string)
|
||||
:Phrase(0)
|
||||
, m_fullScore(0.0)
|
||||
, m_futureScore(0.0)
|
||||
@ -46,14 +46,14 @@ namespace Moses
|
||||
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
|
||||
, m_lhsTarget(NULL)
|
||||
, m_ruleSource(NULL)
|
||||
{
|
||||
|
||||
//ACAT
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
CreateFromString(Output, staticData.GetInputFactorOrder(), out_string, staticData.GetFactorDelimiter(), NULL);
|
||||
}
|
||||
|
||||
TargetPhrase::TargetPhrase()
|
||||
{
|
||||
|
||||
//ACAT
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
CreateFromString(Output, staticData.GetInputFactorOrder(), out_string, staticData.GetFactorDelimiter(), NULL);
|
||||
}
|
||||
|
||||
TargetPhrase::TargetPhrase()
|
||||
:Phrase()
|
||||
, m_fullScore(0.0)
|
||||
, m_futureScore(0.0)
|
||||
@ -61,10 +61,10 @@ namespace Moses
|
||||
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
|
||||
, m_lhsTarget(NULL)
|
||||
, m_ruleSource(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
TargetPhrase::TargetPhrase(const Phrase &phrase)
|
||||
{
|
||||
}
|
||||
|
||||
TargetPhrase::TargetPhrase(const Phrase &phrase)
|
||||
: Phrase(phrase)
|
||||
, m_fullScore(0.0)
|
||||
, m_futureScore(0.0)
|
||||
@ -72,223 +72,223 @@ namespace Moses
|
||||
, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
|
||||
, m_lhsTarget(NULL)
|
||||
, m_ruleSource(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
TargetPhrase::TargetPhrase(const TargetPhrase ©)
|
||||
{
|
||||
}
|
||||
|
||||
// Copy constructor. Scores, the score breakdown and the two alignment-info
// pointers are copied member-wise. The LHS word and the rule source are
// deep-copied, because the destructor deletes this object's own instances.
// NOTE(review): m_properties is not copied here - confirm that is intended.
TargetPhrase::TargetPhrase(const TargetPhrase &copy)
  : Phrase(copy)
  , m_fullScore(copy.m_fullScore)
  , m_futureScore(copy.m_futureScore)
  , m_scoreBreakdown(copy.m_scoreBreakdown)
  , m_alignTerm(copy.m_alignTerm)
  , m_alignNonTerm(copy.m_alignNonTerm)
{
  // Clone the optional left-hand-side word, or leave it unset.
  m_lhsTarget = copy.m_lhsTarget ? new Word(*copy.m_lhsTarget) : NULL;

  // Clone the optional rule source phrase, or leave it unset.
  m_ruleSource = copy.m_ruleSource ? new Phrase(*copy.m_ruleSource) : NULL;
}
|
||||
|
||||
TargetPhrase::~TargetPhrase()
|
||||
{
|
||||
//cerr << "m_lhsTarget=" << m_lhsTarget << endl;
|
||||
|
||||
delete m_lhsTarget;
|
||||
delete m_ruleSource;
|
||||
}
|
||||
|
||||
#ifdef HAVE_PROTOBUF
|
||||
void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
|
||||
{
|
||||
pb->add_trg_words("[X,1]");
|
||||
for (size_t pos = 0 ; pos < GetSize() ; pos++)
|
||||
pb->add_trg_words(GetWord(pos)[0]->GetString());
|
||||
}
|
||||
#endif
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
Evaluate(source, ffs);
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
|
||||
{
|
||||
if (ffs.size()) {
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
ScoreComponentCollection futureScoreBreakdown;
|
||||
for (size_t i = 0; i < ffs.size(); ++i) {
|
||||
const FeatureFunction &ff = *ffs[i];
|
||||
if (! staticData.IsFeatureFunctionIgnored( ff )) {
|
||||
ff.Evaluate(source, *this, m_scoreBreakdown, futureScoreBreakdown);
|
||||
}
|
||||
}
|
||||
|
||||
float weightedScore = m_scoreBreakdown.GetWeightedScore();
|
||||
m_futureScore += futureScoreBreakdown.GetWeightedScore();
|
||||
m_fullScore = weightedScore + m_futureScore;
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
ScoreComponentCollection futureScoreBreakdown;
|
||||
for (size_t i = 0; i < ffs.size(); ++i) {
|
||||
const FeatureFunction &ff = *ffs[i];
|
||||
if (! staticData.IsFeatureFunctionIgnored( ff )) {
|
||||
ff.Evaluate(input, inputPath, *this, m_scoreBreakdown, &futureScoreBreakdown);
|
||||
}
|
||||
}
|
||||
float weightedScore = m_scoreBreakdown.GetWeightedScore();
|
||||
m_futureScore += futureScoreBreakdown.GetWeightedScore();
|
||||
m_fullScore = weightedScore + m_futureScore;
|
||||
}
|
||||
|
||||
void TargetPhrase::SetXMLScore(float score)
|
||||
{
|
||||
const FeatureFunction* prod = PhraseDictionary::GetColl()[0];
|
||||
size_t numScores = prod->GetNumScoreComponents();
|
||||
vector <float> scoreVector(numScores,score/numScores);
|
||||
|
||||
m_scoreBreakdown.Assign(prod, scoreVector);
|
||||
}
|
||||
|
||||
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
|
||||
{
|
||||
AlignmentInfo::CollType alignTerm, alignNonTerm;
|
||||
for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
|
||||
util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));
|
||||
|
||||
char *endptr;
|
||||
size_t sourcePos = strtoul(dash->data(), &endptr, 10);
|
||||
UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
|
||||
++dash;
|
||||
size_t targetPos = strtoul(dash->data(), &endptr, 10);
|
||||
UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
|
||||
UTIL_THROW_IF2(++dash, "Extra gunk in alignment " << *token);
|
||||
|
||||
if (GetWord(targetPos).IsNonTerminal()) {
|
||||
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
} else {
|
||||
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
}
|
||||
}
|
||||
SetAlignTerm(alignTerm);
|
||||
SetAlignNonTerm(alignNonTerm);
|
||||
// cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n";
|
||||
}
|
||||
|
||||
void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
|
||||
{
|
||||
const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
|
||||
m_alignTerm = alignmentInfo;
|
||||
|
||||
}
|
||||
|
||||
void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
|
||||
{
|
||||
const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
|
||||
m_alignNonTerm = alignmentInfo;
|
||||
}
|
||||
|
||||
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
|
||||
{
|
||||
m_scoreBreakdown.Assign(translationScoreProducer, sparseString.as_string());
|
||||
}
|
||||
|
||||
void TargetPhrase::Merge(const TargetPhrase ©, const std::vector<FactorType>& factorVec)
|
||||
{
|
||||
Phrase::MergeFactors(copy, factorVec);
|
||||
m_scoreBreakdown.Merge(copy.GetScoreBreakdown());
|
||||
m_futureScore += copy.m_futureScore;
|
||||
m_fullScore += copy.m_fullScore;
|
||||
}
|
||||
|
||||
void TargetPhrase::SetProperties(const StringPiece &str)
|
||||
{
|
||||
if (str.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
vector<string> toks;
|
||||
TokenizeMultiCharSeparator(toks, str.as_string(), "{{");
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
string &tok = toks[i];
|
||||
if (tok.empty()) {
|
||||
continue;
|
||||
}
|
||||
size_t endPos = tok.rfind("}");
|
||||
|
||||
tok = tok.substr(0, endPos - 1);
|
||||
|
||||
vector<string> keyValue = TokenizeFirstOnly(tok, " ");
|
||||
UTIL_THROW_IF2(keyValue.size() != 2,
|
||||
"Incorrect format of property: " << str);
|
||||
SetProperty(keyValue[0], keyValue[1]);
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::GetProperty(const std::string &key, std::string &value, bool &found) const
|
||||
{
|
||||
std::map<std::string, std::string>::const_iterator iter;
|
||||
iter = m_properties.find(key);
|
||||
if (iter == m_properties.end()) {
|
||||
found = false;
|
||||
} else {
|
||||
found = true;
|
||||
value = iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::SetRuleSource(const Phrase &ruleSource) const
|
||||
{
|
||||
if (m_ruleSource == NULL) {
|
||||
m_ruleSource = new Phrase(ruleSource);
|
||||
}
|
||||
}
|
||||
|
||||
void swap(TargetPhrase &first, TargetPhrase &second)
|
||||
{
|
||||
first.SwapWords(second);
|
||||
std::swap(first.m_fullScore, second.m_fullScore);
|
||||
std::swap(first.m_futureScore, second.m_futureScore);
|
||||
swap(first.m_scoreBreakdown, second.m_scoreBreakdown);
|
||||
std::swap(first.m_alignTerm, second.m_alignTerm);
|
||||
std::swap(first.m_alignNonTerm, second.m_alignNonTerm);
|
||||
std::swap(first.m_lhsTarget, second.m_lhsTarget);
|
||||
}
|
||||
|
||||
TO_STRING_BODY(TargetPhrase);
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
|
||||
{
|
||||
if (tp.m_lhsTarget) {
|
||||
os << *tp.m_lhsTarget<< " -> ";
|
||||
}
|
||||
{
|
||||
if (copy.m_lhsTarget) {
|
||||
m_lhsTarget = new Word(*copy.m_lhsTarget);
|
||||
} else {
|
||||
m_lhsTarget = NULL;
|
||||
}
|
||||
|
||||
if (copy.m_ruleSource) {
|
||||
m_ruleSource = new Phrase(*copy.m_ruleSource);
|
||||
} else {
|
||||
m_ruleSource = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Releases the two heap objects this phrase owns: the LHS word and the
// rule source. Deleting a null pointer is a no-op, so no checks are needed.
TargetPhrase::~TargetPhrase()
{
  delete m_lhsTarget;
  delete m_ruleSource;
}
|
||||
|
||||
#ifdef HAVE_PROTOBUF
|
||||
// Appends this phrase to a protobuf rule: first the fixed "[X,1]" label,
// then the string of factor 0 of every word, in order.
void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
{
  pb->add_trg_words("[X,1]");

  size_t wordIdx = 0;
  while (wordIdx < GetSize()) {
    pb->add_trg_words(GetWord(wordIdx)[0]->GetString());
    ++wordIdx;
  }
}
|
||||
#endif
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
Evaluate(source, ffs);
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
|
||||
{
|
||||
if (ffs.size()) {
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
ScoreComponentCollection futureScoreBreakdown;
|
||||
for (size_t i = 0; i < ffs.size(); ++i) {
|
||||
const FeatureFunction &ff = *ffs[i];
|
||||
if (! staticData.IsFeatureFunctionIgnored( ff )) {
|
||||
ff.Evaluate(source, *this, m_scoreBreakdown, futureScoreBreakdown);
|
||||
}
|
||||
}
|
||||
|
||||
float weightedScore = m_scoreBreakdown.GetWeightedScore();
|
||||
m_futureScore += futureScoreBreakdown.GetWeightedScore();
|
||||
m_fullScore = weightedScore + m_futureScore;
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
ScoreComponentCollection futureScoreBreakdown;
|
||||
for (size_t i = 0; i < ffs.size(); ++i) {
|
||||
const FeatureFunction &ff = *ffs[i];
|
||||
if (! staticData.IsFeatureFunctionIgnored( ff )) {
|
||||
ff.Evaluate(input, inputPath, *this, m_scoreBreakdown, &futureScoreBreakdown);
|
||||
}
|
||||
}
|
||||
float weightedScore = m_scoreBreakdown.GetWeightedScore();
|
||||
m_futureScore += futureScoreBreakdown.GetWeightedScore();
|
||||
m_fullScore = weightedScore + m_futureScore;
|
||||
}
|
||||
|
||||
void TargetPhrase::SetXMLScore(float score)
|
||||
{
|
||||
const FeatureFunction* prod = PhraseDictionary::GetColl()[0];
|
||||
size_t numScores = prod->GetNumScoreComponents();
|
||||
vector <float> scoreVector(numScores,score/numScores);
|
||||
|
||||
m_scoreBreakdown.Assign(prod, scoreVector);
|
||||
}
|
||||
|
||||
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
|
||||
{
|
||||
AlignmentInfo::CollType alignTerm, alignNonTerm;
|
||||
for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
|
||||
util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));
|
||||
|
||||
char *endptr;
|
||||
size_t sourcePos = strtoul(dash->data(), &endptr, 10);
|
||||
UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
|
||||
++dash;
|
||||
size_t targetPos = strtoul(dash->data(), &endptr, 10);
|
||||
UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
|
||||
UTIL_THROW_IF2(++dash, "Extra gunk in alignment " << *token);
|
||||
|
||||
if (GetWord(targetPos).IsNonTerminal()) {
|
||||
alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
} else {
|
||||
alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
|
||||
}
|
||||
}
|
||||
SetAlignTerm(alignTerm);
|
||||
SetAlignNonTerm(alignNonTerm);
|
||||
// cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n";
|
||||
}
|
||||
|
||||
// Registers `coll` with the AlignmentInfoCollection singleton and keeps the
// pointer it returns as the terminal alignment.
void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
{
  m_alignTerm = AlignmentInfoCollection::Instance().Add(coll);
}
|
||||
|
||||
// Registers `coll` with the AlignmentInfoCollection singleton and keeps the
// pointer it returns as the non-terminal alignment.
void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
{
  m_alignNonTerm = AlignmentInfoCollection::Instance().Add(coll);
}
|
||||
|
||||
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
|
||||
{
|
||||
m_scoreBreakdown.Assign(translationScoreProducer, sparseString.as_string());
|
||||
}
|
||||
|
||||
void TargetPhrase::Merge(const TargetPhrase ©, const std::vector<FactorType>& factorVec)
|
||||
{
|
||||
Phrase::MergeFactors(copy, factorVec);
|
||||
m_scoreBreakdown.Merge(copy.GetScoreBreakdown());
|
||||
m_futureScore += copy.m_futureScore;
|
||||
m_fullScore += copy.m_fullScore;
|
||||
}
|
||||
|
||||
void TargetPhrase::SetProperties(const StringPiece &str)
|
||||
{
|
||||
if (str.size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
vector<string> toks;
|
||||
TokenizeMultiCharSeparator(toks, str.as_string(), "{{");
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
string &tok = toks[i];
|
||||
if (tok.empty()) {
|
||||
continue;
|
||||
}
|
||||
size_t endPos = tok.rfind("}");
|
||||
|
||||
tok = tok.substr(0, endPos - 1);
|
||||
|
||||
vector<string> keyValue = TokenizeFirstOnly(tok, " ");
|
||||
UTIL_THROW_IF2(keyValue.size() != 2,
|
||||
"Incorrect format of property: " << str);
|
||||
SetProperty(keyValue[0], keyValue[1]);
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::GetProperty(const std::string &key, std::string &value, bool &found) const
|
||||
{
|
||||
std::map<std::string, std::string>::const_iterator iter;
|
||||
iter = m_properties.find(key);
|
||||
if (iter == m_properties.end()) {
|
||||
found = false;
|
||||
} else {
|
||||
found = true;
|
||||
value = iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::SetRuleSource(const Phrase &ruleSource) const
|
||||
{
|
||||
if (m_ruleSource == NULL) {
|
||||
m_ruleSource = new Phrase(ruleSource);
|
||||
}
|
||||
}
|
||||
|
||||
// Exchanges the contents of two target phrases: the words (via
// SwapWords), both scores, the score breakdown, both alignment pointers
// and the target LHS pointer.
// NOTE(review): m_ruleSource and m_properties, which other members of
// this file use, are not exchanged here - confirm that is intended.
void swap(TargetPhrase &first, TargetPhrase &second)
{
  first.SwapWords(second);

  // scores
  std::swap(first.m_futureScore, second.m_futureScore);
  std::swap(first.m_fullScore, second.m_fullScore);
  swap(first.m_scoreBreakdown, second.m_scoreBreakdown);

  // alignments and left-hand side
  std::swap(first.m_alignNonTerm, second.m_alignNonTerm);
  std::swap(first.m_alignTerm, second.m_alignTerm);
  std::swap(first.m_lhsTarget, second.m_lhsTarget);
}
|
||||
|
||||
// Macro defined outside this file; presumably expands to the definition of
// TargetPhrase's string-conversion member on top of operator<< - TODO confirm.
TO_STRING_BODY(TargetPhrase);
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
|
||||
{
|
||||
if (tp.m_lhsTarget) {
|
||||
os << *tp.m_lhsTarget<< " -> ";
|
||||
}
|
||||
|
||||
os << static_cast<const Phrase&>(tp) << ":" << flush;
|
||||
// os << tp.GetAlignNonTerm() << flush;
|
||||
os << ": term=" << tp.GetAlignTerm() << flush;
|
||||
os << ": nonterm=" << tp.GetAlignNonTerm() << flush;
|
||||
os << ": c=" << tp.m_fullScore << flush;
|
||||
os << " " << tp.m_scoreBreakdown << flush;
|
||||
|
||||
const Phrase *sourcePhrase = tp.GetRuleSource();
|
||||
if (sourcePhrase) {
|
||||
os << " sourcePhrase=" << *sourcePhrase << flush;
|
||||
}
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
os << static_cast<const Phrase&>(tp) << ":" << flush;
|
||||
// os << tp.GetAlignNonTerm() << flush;
|
||||
os << ": term=" << tp.GetAlignTerm() << flush;
|
||||
os << ": nonterm=" << tp.GetAlignNonTerm() << flush;
|
||||
os << ": c=" << tp.m_fullScore << flush;
|
||||
os << " " << tp.m_scoreBreakdown << flush;
|
||||
|
||||
const Phrase *sourcePhrase = tp.GetRuleSource();
|
||||
if (sourcePhrase) {
|
||||
os << " sourcePhrase=" << *sourcePhrase << flush;
|
||||
}
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -136,22 +136,22 @@ SetFeaturesToApply()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
|
||||
void
|
||||
PhraseDictionary::
|
||||
Release(TargetPhraseCollection const* tpc) const
|
||||
{
|
||||
// do nothing by default
|
||||
return;
|
||||
}
|
||||
|
||||
bool
|
||||
PhraseDictionary::
|
||||
PrefixExists(Phrase const& phrase) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
// tell the Phrase Dictionary that the TargetPhraseCollection is not needed any more
|
||||
void
|
||||
PhraseDictionary::
|
||||
Release(TargetPhraseCollection const* tpc) const
|
||||
{
|
||||
// do nothing by default
|
||||
return;
|
||||
}
|
||||
|
||||
bool
|
||||
PhraseDictionary::
|
||||
PrefixExists(Phrase const& phrase) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
PhraseDictionary::
|
||||
|
@ -91,7 +91,7 @@ public:
|
||||
void
|
||||
Release(TargetPhraseCollection const* tpc) const;
|
||||
|
||||
/// Return true if phrase table entries starting with /phrase/
/// exist in the table.
|
||||
virtual
|
||||
bool
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,17 +1,17 @@
|
||||
/***********************************************************************
|
||||
Moses - statistical machine translation system
|
||||
Copyright (C) 2006-2011 University of Edinburgh
|
||||
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
@ -43,126 +43,130 @@
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class ChartParser;
|
||||
class ChartCellCollectionBase;
|
||||
class ChartRuleLookupManager;
|
||||
|
||||
/** Implementation of a Cache-based phrase table.
|
||||
*/
|
||||
class PhraseDictionaryDynamicCacheBased : public PhraseDictionary
|
||||
{
|
||||
|
||||
typedef std::vector<unsigned int> AgeCollection;
|
||||
typedef std::pair<TargetPhraseCollection*, AgeCollection*> TargetCollectionAgePair;
|
||||
typedef std::map<Phrase, TargetCollectionAgePair> cacheMap;
|
||||
|
||||
// data structure for the cache
|
||||
cacheMap m_cacheTM;
|
||||
std::vector<Scores> precomputedScores;
|
||||
unsigned int m_maxAge;
|
||||
size_t m_score_type; //scoring type of the match
|
||||
size_t m_entries; //total number of entries in the cache
|
||||
float m_lower_score; //lower_bound_score for no match
|
||||
std::string m_initfiles; // vector of files loaded in the initialization phase
|
||||
std::string m_name; // internal name to identify this instance of the Cache-based phrase table
|
||||
|
||||
class ChartParser;
|
||||
class ChartCellCollectionBase;
|
||||
class ChartRuleLookupManager;
|
||||
|
||||
/** Implementation of a Cache-based phrase table.
|
||||
*/
|
||||
class PhraseDictionaryDynamicCacheBased : public PhraseDictionary
|
||||
{
|
||||
|
||||
typedef std::vector<unsigned int> AgeCollection;
|
||||
typedef std::pair<TargetPhraseCollection*, AgeCollection*> TargetCollectionAgePair;
|
||||
typedef std::map<Phrase, TargetCollectionAgePair> cacheMap;
|
||||
|
||||
// data structure for the cache
|
||||
cacheMap m_cacheTM;
|
||||
std::vector<Scores> precomputedScores;
|
||||
unsigned int m_maxAge;
|
||||
size_t m_score_type; //scoring type of the match
|
||||
size_t m_entries; //total number of entries in the cache
|
||||
float m_lower_score; //lower_bound_score for no match
|
||||
std::string m_initfiles; // vector of files loaded in the initialization phase
|
||||
std::string m_name; // internal name to identify this instance of the Cache-based phrase table
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
//multiple readers - single writer lock
|
||||
mutable boost::shared_mutex m_cacheLock;
|
||||
//multiple readers - single writer lock
|
||||
mutable boost::shared_mutex m_cacheLock;
|
||||
#endif
|
||||
|
||||
friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryDynamicCacheBased&);
|
||||
|
||||
public:
|
||||
PhraseDictionaryDynamicCacheBased(const std::string &line);
|
||||
~PhraseDictionaryDynamicCacheBased();
|
||||
|
||||
inline const std::string GetName() { return m_name; };
|
||||
inline void SetName(const std::string name){ m_name = name; }
|
||||
|
||||
static const PhraseDictionaryDynamicCacheBased& Instance(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The PhraseDictionaryDynamicCacheBased feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static PhraseDictionaryDynamicCacheBased& InstanceNonConst(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The PhraseDictionaryDynamicCacheBased feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static const PhraseDictionaryDynamicCacheBased& Instance() {
|
||||
return *s_instance;
|
||||
}
|
||||
|
||||
static PhraseDictionaryDynamicCacheBased& InstanceNonConst() {
|
||||
return *s_instance;
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryDynamicCacheBased&);
|
||||
|
||||
public:
|
||||
PhraseDictionaryDynamicCacheBased(const std::string &line);
|
||||
~PhraseDictionaryDynamicCacheBased();
|
||||
|
||||
inline const std::string GetName() {
|
||||
return m_name;
|
||||
};
|
||||
inline void SetName(const std::string name) {
|
||||
m_name = name;
|
||||
}
|
||||
|
||||
static const PhraseDictionaryDynamicCacheBased& Instance(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The PhraseDictionaryDynamicCacheBased feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static PhraseDictionaryDynamicCacheBased& InstanceNonConst(const std::string name) {
|
||||
UTIL_THROW_IF2(s_instance_map.find(name) == s_instance_map.end(), "The PhraseDictionaryDynamicCacheBased feature named " + name + " does not exist!");
|
||||
return *(s_instance_map[name]);
|
||||
}
|
||||
|
||||
static const PhraseDictionaryDynamicCacheBased& Instance() {
|
||||
return *s_instance;
|
||||
}
|
||||
|
||||
static PhraseDictionaryDynamicCacheBased& InstanceNonConst() {
|
||||
return *s_instance;
|
||||
}
|
||||
|
||||
void Load();
|
||||
void Load(const std::string file);
|
||||
|
||||
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &src) const;
|
||||
const TargetPhraseCollection* GetTargetPhraseCollectionNonCacheLEGACY(Phrase const &src) const;
|
||||
|
||||
// for phrase-based model
|
||||
// void GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const;
|
||||
|
||||
// for syntax/hiero model (CKY+ decoding)
|
||||
ChartRuleLookupManager* CreateRuleLookupManager(const ChartParser&, const ChartCellCollectionBase&, std::size_t);
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void InitializeForInput(InputType const& source);
|
||||
|
||||
// virtual void InitializeForInput(InputType const&) {
|
||||
// /* Don't do anything source specific here as this object is shared between threads.*/
|
||||
// }
|
||||
|
||||
void Print() const; // prints the cache
|
||||
void Clear(); // clears the cache
|
||||
|
||||
void ClearEntries(std::string &entries);
|
||||
void ClearSource(std::string &entries);
|
||||
void Insert(std::string &entries);
|
||||
void Execute(std::string command);
|
||||
void ExecuteDlt(std::map<std::string, std::string> dlt_meta);
|
||||
|
||||
void SetScoreType(size_t type);
|
||||
void SetMaxAge(unsigned int age);
|
||||
|
||||
protected:
|
||||
static PhraseDictionaryDynamicCacheBased *s_instance;
|
||||
static std::map< const std::string, PhraseDictionaryDynamicCacheBased * > s_instance_map;
|
||||
|
||||
float decaying_score(const int age); // calculates the decay score given the age
|
||||
void Insert(std::vector<std::string> entries);
|
||||
|
||||
void Decay(); // traverse through the cache and decay each entry
|
||||
void Decay(Phrase p); // traverse through the cache and decay each entry for a given Phrase
|
||||
void Update(std::vector<std::string> entries, std::string ageString);
|
||||
void Update(std::string sourceString, std::string targetString, std::string ageString, std::string waString="");
|
||||
void Update(Phrase p, Phrase tp, int age, std::string waString="");
|
||||
|
||||
void ClearEntries(std::vector<std::string> entries);
|
||||
void ClearEntries(std::string sourceString, std::string targetString);
|
||||
void ClearEntries(Phrase p, Phrase tp);
|
||||
|
||||
void ClearSource(std::vector<std::string> entries);
|
||||
void ClearSource(Phrase sp);
|
||||
|
||||
void Execute(std::vector<std::string> commands);
|
||||
void Execute_Single_Command(std::string command);
|
||||
|
||||
|
||||
void SetPreComputedScores(const unsigned int numScoreComponent);
|
||||
Scores GetPreComputedScores(const unsigned int age);
|
||||
|
||||
void Load_Multiple_Files(std::vector<std::string> files);
|
||||
void Load_Single_File(const std::string file);
|
||||
|
||||
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase) const;
|
||||
};
|
||||
|
||||
void Load();
|
||||
void Load(const std::string file);
|
||||
|
||||
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &src) const;
|
||||
const TargetPhraseCollection* GetTargetPhraseCollectionNonCacheLEGACY(Phrase const &src) const;
|
||||
|
||||
// for phrase-based model
|
||||
// void GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const;
|
||||
|
||||
// for syntax/hiero model (CKY+ decoding)
|
||||
ChartRuleLookupManager* CreateRuleLookupManager(const ChartParser&, const ChartCellCollectionBase&, std::size_t);
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void InitializeForInput(InputType const& source);
|
||||
|
||||
// virtual void InitializeForInput(InputType const&) {
|
||||
// /* Don't do anything source specific here as this object is shared between threads.*/
|
||||
// }
|
||||
|
||||
void Print() const; // prints the cache
|
||||
void Clear(); // clears the cache
|
||||
|
||||
void ClearEntries(std::string &entries);
|
||||
void ClearSource(std::string &entries);
|
||||
void Insert(std::string &entries);
|
||||
void Execute(std::string command);
|
||||
void ExecuteDlt(std::map<std::string, std::string> dlt_meta);
|
||||
|
||||
void SetScoreType(size_t type);
|
||||
void SetMaxAge(unsigned int age);
|
||||
|
||||
protected:
|
||||
static PhraseDictionaryDynamicCacheBased *s_instance;
|
||||
static std::map< const std::string, PhraseDictionaryDynamicCacheBased * > s_instance_map;
|
||||
|
||||
float decaying_score(const int age); // calculates the decay score given the age
|
||||
void Insert(std::vector<std::string> entries);
|
||||
|
||||
void Decay(); // traverse through the cache and decay each entry
|
||||
void Decay(Phrase p); // traverse through the cache and decay each entry for a given Phrase
|
||||
void Update(std::vector<std::string> entries, std::string ageString);
|
||||
void Update(std::string sourceString, std::string targetString, std::string ageString, std::string waString="");
|
||||
void Update(Phrase p, Phrase tp, int age, std::string waString="");
|
||||
|
||||
void ClearEntries(std::vector<std::string> entries);
|
||||
void ClearEntries(std::string sourceString, std::string targetString);
|
||||
void ClearEntries(Phrase p, Phrase tp);
|
||||
|
||||
void ClearSource(std::vector<std::string> entries);
|
||||
void ClearSource(Phrase sp);
|
||||
|
||||
void Execute(std::vector<std::string> commands);
|
||||
void Execute_Single_Command(std::string command);
|
||||
|
||||
|
||||
void SetPreComputedScores(const unsigned int numScoreComponent);
|
||||
Scores GetPreComputedScores(const unsigned int age);
|
||||
|
||||
void Load_Multiple_Files(std::vector<std::string> files);
|
||||
void Load_Single_File(const std::string file);
|
||||
|
||||
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase) const;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
||||
#endif /* moses_PhraseDictionaryDynamicCacheBased_H_ */
|
||||
|
@ -65,7 +65,7 @@ PhraseDictionaryNodeMemory *PhraseDictionaryNodeMemory::GetOrCreateChild(const W
|
||||
// Returns the address of the m_nonTermMap entry for `targetNonTerm`,
// obtained with operator[] (so, for a standard map, the entry is created
// on first use). Throws (UTIL_THROW_IF2) when the word is not a
// non-terminal.
// The second macro-argument line was present twice (removed and added side
// of a diff), which made the statement ill-formed; one copy is kept.
PhraseDictionaryNodeMemory *PhraseDictionaryNodeMemory::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
{
  UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
                 "Not a non-terminal: " << targetNonTerm);

  return &m_nonTermMap[targetNonTerm];
}
|
||||
@ -95,7 +95,7 @@ const PhraseDictionaryNodeMemory *PhraseDictionaryNodeMemory::GetChild(const Wor
|
||||
const PhraseDictionaryNodeMemory *PhraseDictionaryNodeMemory::GetNonTerminalChild(const Word &targetNonTerm) const
|
||||
{
|
||||
UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
|
||||
"Not a non-terminal: " << targetNonTerm);
|
||||
"Not a non-terminal: " << targetNonTerm);
|
||||
|
||||
NonTerminalMap::const_iterator p = m_nonTermMap.find(targetNonTerm);
|
||||
return (p == m_nonTermMap.end()) ? NULL : &p->second;
|
||||
|
@ -59,8 +59,8 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
|
||||
CacheColl &cache = GetCache();
|
||||
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
|
||||
if (iter != cache.end()) {
|
||||
// already in cache
|
||||
|
@ -20,11 +20,11 @@ namespace Moses
|
||||
|
||||
/** constructor; just initialize the base class */
|
||||
TranslationOptionCollectionConfusionNet::
|
||||
TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
|
||||
size_t maxNoTransOptPerCoverage,
|
||||
float translationOptionThreshold)
|
||||
: TranslationOptionCollection(input, maxNoTransOptPerCoverage,
|
||||
translationOptionThreshold)
|
||||
TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
|
||||
size_t maxNoTransOptPerCoverage,
|
||||
float translationOptionThreshold)
|
||||
: TranslationOptionCollection(input, maxNoTransOptPerCoverage,
|
||||
translationOptionThreshold)
|
||||
{
|
||||
// Prefix checkers are phrase dictionaries that provide a prefix check
|
||||
// to indicate that a phrase table entry with a given prefix exists.
|
||||
@ -32,8 +32,8 @@ TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
|
||||
// expanding it further.
|
||||
vector<PhraseDictionary*> prefixCheckers;
|
||||
BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl())
|
||||
if (pd->ProvidesPrefixCheck()) prefixCheckers.push_back(pd);
|
||||
|
||||
if (pd->ProvidesPrefixCheck()) prefixCheckers.push_back(pd);
|
||||
|
||||
const InputFeature &inputFeature = InputFeature::Instance();
|
||||
UTIL_THROW_IF2(&inputFeature == NULL, "Input feature must be specified");
|
||||
|
||||
@ -103,10 +103,10 @@ TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
|
||||
Phrase subphrase(prevPhrase);
|
||||
subphrase.AddWord(word);
|
||||
|
||||
bool OK = prefixCheckers.size() == 0;
|
||||
for (size_t k = 0; !OK && k < prefixCheckers.size(); ++k)
|
||||
OK = prefixCheckers[k]->PrefixExists(subphrase);
|
||||
if (!OK) continue;
|
||||
bool OK = prefixCheckers.size() == 0;
|
||||
for (size_t k = 0; !OK && k < prefixCheckers.size(); ++k)
|
||||
OK = prefixCheckers[k]->PrefixExists(subphrase);
|
||||
if (!OK) continue;
|
||||
|
||||
const ScorePair &scores = col[i].second;
|
||||
ScorePair *inputScore = new ScorePair(*prevInputScore);
|
||||
@ -122,8 +122,8 @@ TranslationOptionCollectionConfusionNet(const ConfusionNet &input,
|
||||
} // for (iterPath = prevPaths.begin(); iterPath != prevPaths.end(); ++iterPath) {
|
||||
}
|
||||
}
|
||||
// cerr << "HAVE " << m_inputPathQueue.size()
|
||||
// << " input paths of max. length "
|
||||
// cerr << "HAVE " << m_inputPathQueue.size()
|
||||
// << " input paths of max. length "
|
||||
// << maxSizePhrase << "." << endl;
|
||||
}
|
||||
|
||||
@ -249,9 +249,9 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLE
|
||||
// go thru each intermediate trans opt just created
|
||||
const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
|
||||
vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
|
||||
for (iterPartialTranslOpt = partTransOptList.begin();
|
||||
iterPartialTranslOpt != partTransOptList.end();
|
||||
++iterPartialTranslOpt) {
|
||||
for (iterPartialTranslOpt = partTransOptList.begin();
|
||||
iterPartialTranslOpt != partTransOptList.end();
|
||||
++iterPartialTranslOpt) {
|
||||
TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
|
||||
|
||||
if (transStep) {
|
||||
|
@ -136,12 +136,12 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
|
||||
const WordsRange &range = path.GetWordsRange();
|
||||
|
||||
if (tpColl && tpColl->GetSize()) {
|
||||
TargetPhraseCollection::const_iterator iter;
|
||||
for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
|
||||
const TargetPhrase &tp = **iter;
|
||||
TranslationOption *transOpt = new TranslationOption(range, tp);
|
||||
transOpt->SetInputPath(path);
|
||||
transOpt->Evaluate(m_source);
|
||||
TargetPhraseCollection::const_iterator iter;
|
||||
for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
|
||||
const TargetPhrase &tp = **iter;
|
||||
TranslationOption *transOpt = new TranslationOption(range, tp);
|
||||
transOpt->SetInputPath(path);
|
||||
transOpt->Evaluate(m_source);
|
||||
|
||||
Add(transOpt);
|
||||
}
|
||||
|
596
moses/Util.cpp
596
moses/Util.cpp
@ -3,17 +3,17 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
@ -42,305 +42,301 @@ using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//global variable
|
||||
Timer g_timer;
|
||||
|
||||
string GetTempFolder()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
char *tmpPath = getenv("TMP");
|
||||
string str(tmpPath);
|
||||
if (str.substr(str.size() - 1, 1) != "\\")
|
||||
str += "\\";
|
||||
return str;
|
||||
#else
|
||||
return "/tmp/";
|
||||
#endif
|
||||
}
|
||||
|
||||
const std::string ToLower(const std::string& str)
|
||||
{
|
||||
std::string lc(str);
|
||||
std::transform(lc.begin(), lc.end(), lc.begin(), (int(*)(int))std::tolower);
|
||||
return lc;
|
||||
}
|
||||
|
||||
class BoolValueException : public util::Exception {};
|
||||
|
||||
template<>
|
||||
bool Scan<bool>(const std::string &input)
|
||||
{
|
||||
std::string lc = ToLower(input);
|
||||
if (lc == "yes" || lc == "y" || lc == "true" || lc == "1")
|
||||
return true;
|
||||
if (lc == "no" || lc == "n" || lc =="false" || lc == "0")
|
||||
return false;
|
||||
UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
|
||||
}
|
||||
|
||||
bool FileExists(const std::string& filePath)
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
return !ifs.fail();
|
||||
}
|
||||
|
||||
const std::string Trim(const std::string& str, const std::string dropChars)
|
||||
{
|
||||
std::string res = str;
|
||||
res.erase(str.find_last_not_of(dropChars)+1);
|
||||
return res.erase(0, res.find_first_not_of(dropChars));
|
||||
}
|
||||
|
||||
void ResetUserTime()
|
||||
{
|
||||
g_timer.start();
|
||||
};
|
||||
|
||||
void PrintUserTime(const std::string &message)
|
||||
{
|
||||
g_timer.check(message.c_str());
|
||||
}
|
||||
|
||||
double GetUserTime()
|
||||
{
|
||||
return g_timer.get_elapsed_time();
|
||||
}
|
||||
|
||||
std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line)
|
||||
{
|
||||
std::vector< std::map<std::string, std::string> > meta;
|
||||
std::string lline = ToLower(line);
|
||||
bool check_dlt = true;
|
||||
|
||||
//allowed format of dlt tag
|
||||
//<dlt type="name" id="name" attr="value"/>
|
||||
//the type attribute is mandatory; the name should not contain any double quotation mark
|
||||
//the id attribute is optional; if present, the name should not contain any double quotation mark
|
||||
//only one additional attribute is possible; value can contain double quotation marks
|
||||
//both name and value must be surrounded by double quotation mark
|
||||
|
||||
// std::cerr << "GLOBAL START" << endl;
|
||||
while (check_dlt) {
|
||||
size_t start = lline.find("<dlt");
|
||||
if (start == std::string::npos) {
|
||||
//no more dlt tags
|
||||
check_dlt = false;
|
||||
continue;
|
||||
}
|
||||
size_t close = lline.find("/>");
|
||||
if (close == std::string::npos) {
|
||||
// error: dlt tag is not ended
|
||||
check_dlt = false;
|
||||
continue;
|
||||
}
|
||||
//std::string dlt = Trim(lline.substr(start+4, close-start-4));
|
||||
std::string dlt = Trim(line.substr(start+4, close-start-4));
|
||||
|
||||
line.erase(start,close-start+2);
|
||||
lline.erase(start,close-start+2);
|
||||
|
||||
if (dlt != "") {
|
||||
std::map<std::string, std::string> tmp_meta;
|
||||
|
||||
//check if type is present and store it
|
||||
size_t start_type = dlt.find("type=");
|
||||
size_t len_type=4;
|
||||
if (start_type != std::string::npos) {
|
||||
//type is present
|
||||
//required format type="value"
|
||||
//double quotation mark is required
|
||||
|
||||
std::string val_type;
|
||||
std::string label_type = dlt.substr(start_type, len_type);
|
||||
if (dlt[start_type+len_type+1] == '"'){
|
||||
val_type = dlt.substr(start_type+len_type+2);
|
||||
size_t close_type = val_type.find('"');
|
||||
val_type = val_type.substr(0, close_type);
|
||||
dlt.erase(start_type,start_type+len_type+2+close_type+1);
|
||||
}
|
||||
else{
|
||||
TRACE_ERR("DLT parse error: missing character \" for type \n");
|
||||
}
|
||||
label_type = Trim(label_type);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label_type] = val_type;
|
||||
}
|
||||
else{
|
||||
//type is not present
|
||||
UTIL_THROW(util::Exception, "ProcessAndStripDLT(std::string &line): Attribute type for dlt tag is mandatory.");
|
||||
}
|
||||
|
||||
//check if id is present and store it
|
||||
size_t start_id = dlt.find("id=");
|
||||
size_t len_id=2;
|
||||
if (start_id != std::string::npos) {
|
||||
//id is present
|
||||
//required format id="name"
|
||||
//double quotation mark is required
|
||||
|
||||
std::string val_id;
|
||||
std::string label_id = dlt.substr(start_id, len_id);
|
||||
if (dlt[start_id+len_id+1] == '"'){
|
||||
val_id = dlt.substr(start_id+len_id+2);
|
||||
size_t close_id = val_id.find('"');
|
||||
val_id = val_id.substr(0, close_id);
|
||||
dlt.erase(start_id,start_id+len_id+2+close_id+1);
|
||||
}
|
||||
else{
|
||||
TRACE_ERR("DLT parse error: missing character \" for id \n");
|
||||
}
|
||||
label_id = Trim(label_id);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label_id] = val_id;
|
||||
}
|
||||
else{
|
||||
//id is not present
|
||||
//do nothing
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < dlt.size(); i++) {
|
||||
if (dlt[i] == '=') {
|
||||
std::string label = dlt.substr(0, i);
|
||||
std::string val = dlt.substr(i+1);
|
||||
if (val[0] == '"') {
|
||||
|
||||
val = val.substr(1);
|
||||
// it admits any double quotation mark (but is attribute) in the value of the attribute
|
||||
// it assumes that just one attribute (besides id attribute) is present in the tag,
|
||||
// it assumes that the value starts and ends with double quotation mark
|
||||
size_t close = val.rfind('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
}
|
||||
}
|
||||
label = Trim(label);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label] = val;
|
||||
}
|
||||
}
|
||||
|
||||
meta.push_back(tmp_meta);
|
||||
}
|
||||
}
|
||||
//global variable
|
||||
Timer g_timer;
|
||||
|
||||
// Returns the directory to use for temporary files, always ending in a
// path separator.
//  - Windows: the value of %TMP%, or %TEMP% when TMP is unset or empty,
//    or ".\" (current directory) when neither is usable. getenv() returns
//    NULL for an unset variable and building a std::string from NULL is
//    undefined, so the pointer is checked first.
//  - elsewhere: the fixed path "/tmp/".
std::string GetTempFolder()
{
#ifdef _WIN32
  const char *tmpPath = getenv("TMP");
  if (tmpPath == NULL || *tmpPath == '\0') {
    tmpPath = getenv("TEMP");
  }
  if (tmpPath == NULL || *tmpPath == '\0') {
    return ".\\";
  }
  std::string str(tmpPath);
  if (str[str.size() - 1] != '\\')
    str += "\\";
  return str;
#else
  return "/tmp/";
#endif
}
|
||||
|
||||
// Returns a lower-cased copy of `str` (per-byte, current C locale).
// Each byte is converted to unsigned char before it reaches std::tolower:
// passing a negative char value (any byte >= 0x80 where char is signed)
// is undefined behaviour for the <cctype> functions.
const std::string ToLower(const std::string& str)
{
  std::string lc(str);
  for (std::string::iterator it = lc.begin(); it != lc.end(); ++it) {
    *it = static_cast<char>(std::tolower(static_cast<unsigned char>(*it)));
  }
  return lc;
}
|
||||
|
||||
class BoolValueException : public util::Exception {};
|
||||
|
||||
template<>
|
||||
bool Scan<bool>(const std::string &input)
|
||||
{
|
||||
std::string lc = ToLower(input);
|
||||
if (lc == "yes" || lc == "y" || lc == "true" || lc == "1")
|
||||
return true;
|
||||
if (lc == "no" || lc == "n" || lc =="false" || lc == "0")
|
||||
return false;
|
||||
UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
|
||||
}
|
||||
|
||||
// Reports whether `filePath` can be opened for reading: an input stream is
// opened on it and the result is true unless the stream is in a failed state.
bool FileExists(const std::string& filePath)
{
  std::ifstream probe(filePath.c_str());
  if (probe.fail()) {
    return false;
  }
  return true;
}
|
||||
|
||||
// Returns `str` without any leading or trailing characters that occur in
// `dropChars`. A string made only of such characters (or an empty string)
// yields an empty result; characters inside the string are left alone.
const std::string Trim(const std::string& str, const std::string dropChars)
{
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  if (lastKept == std::string::npos) {
    return std::string();
  }
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
|
||||
|
||||
void ResetUserTime()
|
||||
{
|
||||
g_timer.start();
|
||||
};
|
||||
|
||||
void PrintUserTime(const std::string &message)
|
||||
{
|
||||
g_timer.check(message.c_str());
|
||||
}
|
||||
|
||||
double GetUserTime()
|
||||
{
|
||||
return g_timer.get_elapsed_time();
|
||||
}
|
||||
|
||||
std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line)
|
||||
{
|
||||
std::vector< std::map<std::string, std::string> > meta;
|
||||
std::string lline = ToLower(line);
|
||||
bool check_dlt = true;
|
||||
|
||||
//allowed format of dlt tag
|
||||
//<dlt type="name" id="name" attr="value"/>
|
||||
//the type attribute is mandatory; the name should not contain any double quotation mark
|
||||
//the id attribute is optional; if present, the name should not contain any double quotation mark
|
||||
//only one additional attribute is possible; value can contain double quotation marks
|
||||
//both name and value must be surrounded by double quotation mark
|
||||
|
||||
// std::cerr << "GLOBAL START" << endl;
|
||||
while (check_dlt) {
|
||||
size_t start = lline.find("<dlt");
|
||||
if (start == std::string::npos) {
|
||||
//no more dlt tags
|
||||
check_dlt = false;
|
||||
continue;
|
||||
}
|
||||
size_t close = lline.find("/>");
|
||||
if (close == std::string::npos) {
|
||||
// error: dlt tag is not ended
|
||||
check_dlt = false;
|
||||
continue;
|
||||
}
|
||||
//std::string dlt = Trim(lline.substr(start+4, close-start-4));
|
||||
std::string dlt = Trim(line.substr(start+4, close-start-4));
|
||||
|
||||
line.erase(start,close-start+2);
|
||||
lline.erase(start,close-start+2);
|
||||
|
||||
if (dlt != "") {
|
||||
std::map<std::string, std::string> tmp_meta;
|
||||
|
||||
//check if type is present and store it
|
||||
size_t start_type = dlt.find("type=");
|
||||
size_t len_type=4;
|
||||
if (start_type != std::string::npos) {
|
||||
//type is present
|
||||
//required format type="value"
|
||||
//double quotation mark is required
|
||||
|
||||
std::string val_type;
|
||||
std::string label_type = dlt.substr(start_type, len_type);
|
||||
if (dlt[start_type+len_type+1] == '"') {
|
||||
val_type = dlt.substr(start_type+len_type+2);
|
||||
size_t close_type = val_type.find('"');
|
||||
val_type = val_type.substr(0, close_type);
|
||||
dlt.erase(start_type,start_type+len_type+2+close_type+1);
|
||||
} else {
|
||||
TRACE_ERR("DLT parse error: missing character \" for type \n");
|
||||
}
|
||||
label_type = Trim(label_type);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label_type] = val_type;
|
||||
} else {
|
||||
//type is not present
|
||||
UTIL_THROW(util::Exception, "ProcessAndStripDLT(std::string &line): Attribute type for dlt tag is mandatory.");
|
||||
}
|
||||
|
||||
//check if id is present and store it
|
||||
size_t start_id = dlt.find("id=");
|
||||
size_t len_id=2;
|
||||
if (start_id != std::string::npos) {
|
||||
//id is present
|
||||
//required format id="name"
|
||||
//double quotation mark is required
|
||||
|
||||
std::string val_id;
|
||||
std::string label_id = dlt.substr(start_id, len_id);
|
||||
if (dlt[start_id+len_id+1] == '"') {
|
||||
val_id = dlt.substr(start_id+len_id+2);
|
||||
size_t close_id = val_id.find('"');
|
||||
val_id = val_id.substr(0, close_id);
|
||||
dlt.erase(start_id,start_id+len_id+2+close_id+1);
|
||||
} else {
|
||||
TRACE_ERR("DLT parse error: missing character \" for id \n");
|
||||
}
|
||||
label_id = Trim(label_id);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label_id] = val_id;
|
||||
} else {
|
||||
//id is not present
|
||||
//do nothing
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < dlt.size(); i++) {
|
||||
if (dlt[i] == '=') {
|
||||
std::string label = dlt.substr(0, i);
|
||||
std::string val = dlt.substr(i+1);
|
||||
if (val[0] == '"') {
|
||||
|
||||
val = val.substr(1);
|
||||
// it admits any double quotation mark (but is attribute) in the value of the attribute
|
||||
// it assumes that just one attribute (besides id attribute) is present in the tag,
|
||||
// it assumes that the value starts and ends with double quotation mark
|
||||
size_t close = val.rfind('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
dlt = "";
|
||||
i = 0;
|
||||
} else {
|
||||
dlt = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
}
|
||||
}
|
||||
label = Trim(label);
|
||||
dlt = Trim(dlt);
|
||||
|
||||
tmp_meta[label] = val;
|
||||
}
|
||||
}
|
||||
|
||||
meta.push_back(tmp_meta);
|
||||
}
|
||||
}
|
||||
// std::cerr << "GLOBAL END" << endl;
|
||||
return meta;
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> ProcessAndStripSGML(std::string &line)
|
||||
{
|
||||
std::map<std::string, std::string> meta;
|
||||
std::string lline = ToLower(line);
|
||||
if (lline.find("<seg")!=0) return meta;
|
||||
size_t close = lline.find(">");
|
||||
if (close == std::string::npos) return meta; // error
|
||||
size_t end = lline.find("</seg>");
|
||||
std::string seg = Trim(lline.substr(4, close-4));
|
||||
std::string text = line.substr(close+1, end - close - 1);
|
||||
for (size_t i = 1; i < seg.size(); i++) {
|
||||
if (seg[i] == '=' && seg[i-1] == ' ') {
|
||||
std::string less = seg.substr(0, i-1) + seg.substr(i);
|
||||
seg = less;
|
||||
i = 0;
|
||||
continue;
|
||||
}
|
||||
if (seg[i] == '=' && seg[i+1] == ' ') {
|
||||
std::string less = seg.substr(0, i+1);
|
||||
if (i+2 < seg.size()) less += seg.substr(i+2);
|
||||
seg = less;
|
||||
i = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
line = Trim(text);
|
||||
if (seg == "") return meta;
|
||||
for (size_t i = 1; i < seg.size(); i++) {
|
||||
if (seg[i] == '=') {
|
||||
std::string label = seg.substr(0, i);
|
||||
std::string val = seg.substr(i+1);
|
||||
if (val[0] == '"') {
|
||||
val = val.substr(1);
|
||||
size_t close = val.find('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
seg = "";
|
||||
i = 0;
|
||||
} else {
|
||||
seg = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
seg = "";
|
||||
i = 0;
|
||||
} else {
|
||||
seg = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
}
|
||||
}
|
||||
label = Trim(label);
|
||||
seg = Trim(seg);
|
||||
meta[label] = val;
|
||||
}
|
||||
}
|
||||
return meta;
|
||||
}
|
||||
|
||||
std::string PassthroughSGML(std::string &line, const std::string tagName, const std::string& lbrackStr, const std::string& rbrackStr)
|
||||
{
|
||||
string lbrack = lbrackStr; // = "<";
|
||||
string rbrack = rbrackStr; // = ">";
|
||||
|
||||
std::string meta = "";
|
||||
|
||||
std::string lline = ToLower(line);
|
||||
size_t open = lline.find(lbrack+tagName);
|
||||
//check whether the tag exists; if not return the empty string
|
||||
if (open == std::string::npos) return meta;
|
||||
|
||||
size_t close = lline.find(rbrack, open);
|
||||
//check whether the tag is closed with '/>'; if not return the empty string
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n");
|
||||
return meta;
|
||||
}
|
||||
// extract the tag
|
||||
std::string tmp = line.substr(open, close - open + 1);
|
||||
meta = line.substr(open, close - open + 1);
|
||||
|
||||
// strip the tag from the line
|
||||
line = line.substr(0, open) + line.substr(close + 1, std::string::npos);
|
||||
|
||||
TRACE_ERR("The input contains a <passthrough info/> tag:" << meta << std::endl);
|
||||
|
||||
lline = ToLower(line);
|
||||
open = lline.find(lbrack+tagName);
|
||||
if (open != std::string::npos) {
|
||||
TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n");
|
||||
}
|
||||
return meta;
|
||||
}
|
||||
|
||||
return meta;
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> ProcessAndStripSGML(std::string &line)
|
||||
{
|
||||
std::map<std::string, std::string> meta;
|
||||
std::string lline = ToLower(line);
|
||||
if (lline.find("<seg")!=0) return meta;
|
||||
size_t close = lline.find(">");
|
||||
if (close == std::string::npos) return meta; // error
|
||||
size_t end = lline.find("</seg>");
|
||||
std::string seg = Trim(lline.substr(4, close-4));
|
||||
std::string text = line.substr(close+1, end - close - 1);
|
||||
for (size_t i = 1; i < seg.size(); i++) {
|
||||
if (seg[i] == '=' && seg[i-1] == ' ') {
|
||||
std::string less = seg.substr(0, i-1) + seg.substr(i);
|
||||
seg = less;
|
||||
i = 0;
|
||||
continue;
|
||||
}
|
||||
if (seg[i] == '=' && seg[i+1] == ' ') {
|
||||
std::string less = seg.substr(0, i+1);
|
||||
if (i+2 < seg.size()) less += seg.substr(i+2);
|
||||
seg = less;
|
||||
i = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
line = Trim(text);
|
||||
if (seg == "") return meta;
|
||||
for (size_t i = 1; i < seg.size(); i++) {
|
||||
if (seg[i] == '=') {
|
||||
std::string label = seg.substr(0, i);
|
||||
std::string val = seg.substr(i+1);
|
||||
if (val[0] == '"') {
|
||||
val = val.substr(1);
|
||||
size_t close = val.find('"');
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("SGML parse error: missing \"\n");
|
||||
seg = "";
|
||||
i = 0;
|
||||
} else {
|
||||
seg = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
i = 0;
|
||||
}
|
||||
} else {
|
||||
size_t close = val.find(' ');
|
||||
if (close == std::string::npos) {
|
||||
seg = "";
|
||||
i = 0;
|
||||
} else {
|
||||
seg = val.substr(close+1);
|
||||
val = val.substr(0, close);
|
||||
}
|
||||
}
|
||||
label = Trim(label);
|
||||
seg = Trim(seg);
|
||||
meta[label] = val;
|
||||
}
|
||||
}
|
||||
return meta;
|
||||
}
|
||||
|
||||
std::string PassthroughSGML(std::string &line, const std::string tagName, const std::string& lbrackStr, const std::string& rbrackStr)
|
||||
{
|
||||
string lbrack = lbrackStr; // = "<";
|
||||
string rbrack = rbrackStr; // = ">";
|
||||
|
||||
std::string meta = "";
|
||||
|
||||
std::string lline = ToLower(line);
|
||||
size_t open = lline.find(lbrack+tagName);
|
||||
//check whether the tag exists; if not return the empty string
|
||||
if (open == std::string::npos) return meta;
|
||||
|
||||
size_t close = lline.find(rbrack, open);
|
||||
//check whether the tag is closed with '/>'; if not return the empty string
|
||||
if (close == std::string::npos) {
|
||||
TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n");
|
||||
return meta;
|
||||
}
|
||||
// extract the tag
|
||||
std::string tmp = line.substr(open, close - open + 1);
|
||||
meta = line.substr(open, close - open + 1);
|
||||
|
||||
// strip the tag from the line
|
||||
line = line.substr(0, open) + line.substr(close + 1, std::string::npos);
|
||||
|
||||
TRACE_ERR("The input contains a <passthrough info/> tag:" << meta << std::endl);
|
||||
|
||||
lline = ToLower(line);
|
||||
open = lline.find(lbrack+tagName);
|
||||
if (open != std::string::npos) {
|
||||
TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n");
|
||||
}
|
||||
return meta;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -98,45 +98,44 @@ StringPiece Word::GetString(FactorType factorType) const
|
||||
|
||||
// Exception type thrown by Word::CreateFromString when a word string contains
// the factor delimiter more often than allowed.
class StrayFactorException : public util::Exception {};
|
||||
|
||||
void
|
||||
void
|
||||
Word::
|
||||
CreateFromString(FactorDirection direction
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const StringPiece &str
|
||||
, bool isNonTerminal
|
||||
, bool strict)
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const StringPiece &str
|
||||
, bool isNonTerminal
|
||||
, bool strict)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
vector<StringPiece> bits(MAX_NUM_FACTORS);
|
||||
util::TokenIter<util::MultiCharacter>
|
||||
fit(str, StaticData::Instance().GetFactorDelimiter());
|
||||
util::TokenIter<util::MultiCharacter>
|
||||
fit(str, StaticData::Instance().GetFactorDelimiter());
|
||||
size_t i = 0;
|
||||
for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
|
||||
bits[i] = *fit;
|
||||
if (i == MAX_NUM_FACTORS)
|
||||
UTIL_THROW_IF(fit, StrayFactorException,
|
||||
"The hard limit for factors is " << MAX_NUM_FACTORS
|
||||
<< ". The word " << str << " contains factor delimiter "
|
||||
<< StaticData::Instance().GetFactorDelimiter()
|
||||
<< " too many times.");
|
||||
UTIL_THROW_IF(fit, StrayFactorException,
|
||||
"The hard limit for factors is " << MAX_NUM_FACTORS
|
||||
<< ". The word " << str << " contains factor delimiter "
|
||||
<< StaticData::Instance().GetFactorDelimiter()
|
||||
<< " too many times.");
|
||||
if (strict)
|
||||
UTIL_THROW_IF(fit, StrayFactorException,
|
||||
"You have configured " << factorOrder.size()
|
||||
<< " factors but the word " << str
|
||||
<< " contains factor delimiter "
|
||||
<< StaticData::Instance().GetFactorDelimiter()
|
||||
<< " too many times.");
|
||||
|
||||
UTIL_THROW_IF(i < factorOrder.size(),util::Exception,
|
||||
"Too few factors in string '" << str << "'.");
|
||||
|
||||
for (size_t k = 0; k < factorOrder.size(); ++k)
|
||||
{
|
||||
UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
|
||||
"Factor order out of bounds.");
|
||||
m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
|
||||
}
|
||||
|
||||
UTIL_THROW_IF(fit, StrayFactorException,
|
||||
"You have configured " << factorOrder.size()
|
||||
<< " factors but the word " << str
|
||||
<< " contains factor delimiter "
|
||||
<< StaticData::Instance().GetFactorDelimiter()
|
||||
<< " too many times.");
|
||||
|
||||
UTIL_THROW_IF(i < factorOrder.size(),util::Exception,
|
||||
"Too few factors in string '" << str << "'.");
|
||||
|
||||
for (size_t k = 0; k < factorOrder.size(); ++k) {
|
||||
UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
|
||||
"Factor order out of bounds.");
|
||||
m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
|
||||
}
|
||||
|
||||
// assume term/non-term same for all factors
|
||||
m_isNonTerminal = isNonTerminal;
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ public:
|
||||
, const std::vector<FactorType> &factorOrder
|
||||
, const StringPiece &str
|
||||
, bool isNonTerminal
|
||||
, bool strict = true);
|
||||
, bool strict = true);
|
||||
|
||||
void CreateUnknownWord(const Word &sourceWord);
|
||||
|
||||
|
@ -49,11 +49,11 @@ void WordLattice::Print(std::ostream& out) const
|
||||
out<<"\n\n";
|
||||
}
|
||||
|
||||
int
|
||||
int
|
||||
WordLattice::
|
||||
InitializeFromPCNDataType
|
||||
(const PCN::CN& cn,
|
||||
const std::vector<FactorType>& factorOrder,
|
||||
(const PCN::CN& cn,
|
||||
const std::vector<FactorType>& factorOrder,
|
||||
const std::string& debug_line)
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
@ -78,20 +78,20 @@ InitializeFromPCNDataType
|
||||
|
||||
//check for correct number of link parameters
|
||||
if (alt.m_denseFeatures.size() != numInputScores) {
|
||||
TRACE_ERR("ERROR: need " << numInputScores
|
||||
<< " link parameters, found "
|
||||
<< alt.m_denseFeatures.size()
|
||||
<< " while reading column " << i
|
||||
<< " from " << debug_line << "\n");
|
||||
TRACE_ERR("ERROR: need " << numInputScores
|
||||
<< " link parameters, found "
|
||||
<< alt.m_denseFeatures.size()
|
||||
<< " while reading column " << i
|
||||
<< " from " << debug_line << "\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
//check each element for bounds
|
||||
std::vector<float>::const_iterator probsIterator;
|
||||
data[i][j].second = std::vector<float>(0);
|
||||
for(probsIterator = alt.m_denseFeatures.begin();
|
||||
probsIterator < alt.m_denseFeatures.end();
|
||||
probsIterator++) {
|
||||
for(probsIterator = alt.m_denseFeatures.begin();
|
||||
probsIterator < alt.m_denseFeatures.end();
|
||||
probsIterator++) {
|
||||
IFVERBOSE(1) {
|
||||
if (*probsIterator < 0.0f) {
|
||||
TRACE_ERR("WARN: neg probability: " << *probsIterator << "\n");
|
||||
|
@ -66,7 +66,7 @@ int WordsBitmap::GetFutureCosts(int lastPos) const
|
||||
bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const
|
||||
{
|
||||
if (GetNumWordsCovered() == 0) {
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t first = GetFirstGapPos();
|
||||
|
Loading…
Reference in New Issue
Block a user