fix word string for non-terminals, fix sparse feature weight for the case of more than one DLM feature, fix factors for start and end symbols

Eva Hasler 2012-01-09 12:04:18 +00:00
parent 24887b40f0
commit eaf940d5c1
4 changed files with 34 additions and 24 deletions

ScoreComponentCollection.cpp

@@ -64,7 +64,7 @@ void ScoreComponentCollection::MultiplyEquals(float scalar)
 // Multiply all weights of this sparse producer by a given scalar
 void ScoreComponentCollection::MultiplyEquals(const ScoreProducer* sp, float scalar) {
   assert(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
-  std::string prefix = sp->GetScoreProducerWeightShortName() + FName::SEP;
+  std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
   for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
     std::stringstream name;
     name << i->first;
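
For reference, a standalone sketch of why the prefix matters when more than one DLM feature is configured: the shared weight short name "dlm" cannot tell two features' sparse weights apart, while the per-feature description can. FVector/FName are simplified to a plain std::map here, and the descriptions "dlm1"/"dlm2" are illustrative, not taken from the commit.

#include <iostream>
#include <map>
#include <string>

// Sketch only: scale the sparse weights belonging to one producer, selected by
// a name prefix built from that producer's description.
void MultiplyEquals(std::map<std::string, float> &scores,
                    const std::string &producerDescription, float scalar) {
  const std::string prefix = producerDescription + "_";        // description + separator
  for (std::map<std::string, float>::iterator i = scores.begin(); i != scores.end(); ++i) {
    if (i->first.compare(0, prefix.size(), prefix) == 0)       // this producer's features only
      i->second *= scalar;
  }
}

int main() {
  std::map<std::string, float> scores;
  scores["dlm1_the:cat"] = 0.5f;   // first DLM feature (illustrative description "dlm1")
  scores["dlm2_the:cat"] = 0.5f;   // second DLM feature (illustrative description "dlm2")
  MultiplyEquals(scores, "dlm1", 2.0f);
  std::cout << scores["dlm1_the:cat"] << " " << scores["dlm2_the:cat"] << "\n";  // prints: 1 0.5
  return 0;
}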

TargetNgramFeature.cpp

@@ -60,11 +60,6 @@ string TargetNgramFeature::GetScoreProducerWeightShortName(unsigned) const
   return "dlm";
 }
-string TargetNgramFeature::GetShortNameWithSEP() const
-{
-  return "dlm_";
-}
 size_t TargetNgramFeature::GetNumInputScores() const
 {
   return 0;
@@ -187,8 +182,8 @@ void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream
 FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const
 {
-  vector<const Word*> contextFactor;
-  contextFactor.reserve(m_n);
+  vector<const Word*> contextFactor;
+  contextFactor.reserve(m_n);
   // get index map for underlying hypotheses
   const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
@@ -219,18 +214,21 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
       if (phrasePos==cur_hypo.GetCurrTargetPhrase().GetSize()-1 || prev_is_NT)
         makeSuffix = true;
-      // beginning of sentence symbol <s>?
-      string w = word.GetString(m_factorType);
-      if (w.compare("<s>") == 0)
+      // beginning/end of sentence symbol <s>,</s>?
+      string factorZero = word.GetString(0);
+      if (factorZero.compare("<s>") == 0)
         prefixTerminals++;
       // end of sentence symbol </s>?
-      else if (w.compare("</s>") == 0)
+      else if (factorZero.compare("</s>") == 0)
         suffixTerminals++;
       // everything else
       else {
         stringstream ngram;
-        ngram << GetShortNameWithSEP();
-        ngram << word.GetString(m_factorType);
+        ngram << m_baseName;
+        if (m_factorType == 0)
+          ngram << factorZero;
+        else
+          ngram << word.GetString(m_factorType);
         accumulator->SparsePlusEquals(ngram.str(), 1);
         if (collectForPrefix)
@@ -346,7 +344,7 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
       // remove duplicates
       stringstream curr_ngram;
-      curr_ngram << GetShortNameWithSEP();
+      curr_ngram << m_baseName;
       curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType);
       curr_ngram << ":";
       curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType);
@@ -373,7 +371,7 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
   size_t size = contextFactor.size();
   if (makePrefix && makeSuffix && (size <= m_n)) {
     stringstream curr_ngram;
-    curr_ngram << GetShortNameWithSEP();
+    curr_ngram << m_baseName;
     for (size_t i = 0; i < size; ++i) {
       curr_ngram << (*contextFactor[i]).GetString(m_factorType);
       if (i < size-1)
@@ -394,11 +392,15 @@ void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFacto
   for (size_t k = 0; k < numberOfStartPos; ++k) {
     size_t max_end = (size < m_n+k+offset)? size: m_n+k+offset;
     for (size_t end_pos = 1+k+offset; end_pos < max_end; ++end_pos) {
-      ngram << GetShortNameWithSEP();
+      ngram << m_baseName;
       for (size_t i=k+offset; i <= end_pos; ++i) {
         if (i > k+offset)
           ngram << ":";
-        ngram << (*contextFactor[i]).GetString(m_factorType);
+        string factorZero = (*contextFactor[i]).GetString(0);
+        if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
+          ngram << factorZero;
+        else
+          ngram << (*contextFactor[i]).GetString(m_factorType);
         const Word w = *contextFactor[i];
       }
       // cerr << "p-ngram: " << ngram.str() << endl;
@@ -413,12 +415,16 @@ void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFacto
   for (size_t k = 0; k < numberOfEndPos; ++k) {
     size_t end_pos = contextFactor.size()-1-k-offset;
     for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < m_n); --start_pos) {
-      ngram << GetShortNameWithSEP();
+      ngram << m_baseName;
       for (size_t j=start_pos; j <= end_pos; ++j){
-        ngram << (*contextFactor[j]).GetString(m_factorType);
-        if (j < end_pos)
+        string factorZero = (*contextFactor[j]).GetString(0);
+        if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
+          ngram << factorZero;
+        else
+          ngram << (*contextFactor[j]).GetString(m_factorType);
+        if (j < end_pos)
           ngram << ":";
       }
     }
     // cerr << "s-ngram: " << ngram.str() << endl;
     accumulator->SparsePlusEquals(ngram.str(), 1);
     ngram.str("");
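
All three n-gram builders in this file now pick the word string the same way: the boundary symbols <s> and </s> are only carried on factor 0, so they are read from there even when the feature is configured on another factor such as POS. A standalone sketch of that selection rule, with a word reduced to its vector of factor strings (types and names are illustrative):

#include <string>
#include <vector>

// Standalone sketch: factor 0 is the surface form, higher indices are other factors.
typedef std::vector<std::string> WordFactors;

// Use factor 0 for sentence-boundary symbols (and when the feature runs on factor 0
// anyway); otherwise use the configured factor, e.g. a POS tag.
std::string FactorString(const WordFactors &word, size_t factorType) {
  const std::string &factorZero = word[0];
  if (factorType == 0 || factorZero == "<s>" || factorZero == "</s>")
    return factorZero;
  return word[factorType];   // assumes ordinary words carry the configured factor
}

For a boundary word that only carries "<s>" on factor 0, this avoids reading a factor that may not be set.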

TargetNgramFeature.h

@@ -190,12 +190,13 @@ public:
     FactorCollection& factorCollection = FactorCollection::Instance();
     const Factor* bosFactor = factorCollection.AddFactor(Output,m_factorType,BOS_);
     m_bos.SetFactor(m_factorType,bosFactor);
+    m_baseName = GetScoreProducerDescription();
+    m_baseName.append("_");
   }
   bool Load(const std::string &filePath);
   std::string GetScoreProducerWeightShortName(unsigned) const;
-  std::string GetShortNameWithSEP() const;
   size_t GetNumInputScores() const;
   void SetSparseProducerWeight(float weight) { m_sparseProducerWeight = weight; }
@@ -219,6 +220,8 @@ private:
   // additional weight that all sparse weights are scaled with
   float m_sparseProducerWeight;
+  std::string m_baseName;
   void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const;
   void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
                         size_t numberOfStartPos = 1, size_t offset = 0) const;
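
m_baseName replaces the hard-coded "dlm_" prefix from the removed GetShortNameWithSEP(): it is built once from the per-feature description, so two DLM instances produce distinct sparse feature names, and it lines up with the description-based prefix now used in ScoreComponentCollection::MultiplyEquals (assuming FName::SEP is the same "_" separator). A small sketch of the resulting naming; the class and the descriptions are illustrative:

#include <iostream>
#include <sstream>
#include <string>

// Sketch: the feature-name prefix is fixed per feature instance at load time.
struct DlmFeature {
  std::string m_baseName;
  explicit DlmFeature(const std::string &description) : m_baseName(description + "_") {}
  std::string NgramName(const std::string &w1, const std::string &w2) const {
    std::stringstream ngram;
    ngram << m_baseName << w1 << ":" << w2;   // e.g. "dlm1_the:cat"
    return ngram.str();
  }
};

int main() {
  DlmFeature a("dlm1"), b("dlm2");            // two DLM features configured side by side
  std::cout << a.NgramName("the", "cat") << " " << b.NgramName("the", "cat") << "\n";
  return 0;
}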

Word.cpp

@@ -103,7 +103,8 @@ void Word::CreateFromString(FactorDirection direction
   vector<string> wordVec;
   Tokenize(wordVec, str, "|");
-  assert(wordVec.size() == factorOrder.size());
+  if (!isNonTerminal)
+    assert(wordVec.size() == factorOrder.size());
   const Factor *factor;
   for (size_t ind = 0; ind < wordVec.size(); ++ind) {
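
The guard reflects that a non-terminal label such as "[X]" is written as a single token rather than as factor0|factor1|..., so the per-factor size check only makes sense for terminals. A standalone sketch of the guarded check, with Tokenize inlined and names illustrative:

#include <cassert>
#include <string>
#include <vector>

// Standalone sketch: split a word on '|' and check the factor count only for terminals.
void CheckFactors(const std::string &str, size_t numFactors, bool isNonTerminal) {
  std::vector<std::string> wordVec;
  std::string::size_type start = 0, bar;
  while ((bar = str.find('|', start)) != std::string::npos) {
    wordVec.push_back(str.substr(start, bar - start));
    start = bar + 1;
  }
  wordVec.push_back(str.substr(start));
  if (!isNonTerminal)
    assert(wordVec.size() == numFactors);   // terminals must supply every requested factor
}

int main() {
  CheckFactors("the|DT", 2, false);   // terminal: two fields for two factors, passes
  CheckFactors("[X]", 2, true);       // non-terminal: single label, check skipped
  return 0;
}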