mosesdecoder/moses/PP/SourceLabelsPhraseProperty.cpp

125 lines
4.2 KiB
C++
Raw Permalink Normal View History

2014-06-12 00:08:22 +04:00
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <queue>
#include <cassert>
2014-06-12 00:08:22 +04:00
#include <limits>
namespace Moses
{
void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
2014-06-12 00:08:22 +04:00
{
std::istringstream tokenizer(value);
2014-06-12 00:08:22 +04:00
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
2014-08-08 00:02:51 +04:00
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
2014-06-12 00:08:22 +04:00
}
assert( m_nNTs > 0 );
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
2014-08-08 00:02:51 +04:00
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
2014-06-12 00:08:22 +04:00
}
assert( m_totalCount > 0.0 );
2015-01-14 14:07:42 +03:00
// read source-labelled rule items
2014-06-12 00:08:22 +04:00
std::priority_queue<float> ruleLabelledCountsPQ;
while (tokenizer.peek() != EOF) {
2014-08-08 00:02:51 +04:00
// try {
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
SourceLabelsPhrasePropertyItem item;
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
if (m_nNTs == 1) {
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
item.m_sourceLabelsRHSCount = m_totalCount;
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
} else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
size_t sourceLabelRHS;
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
2014-06-12 00:08:22 +04:00
}
2015-01-14 14:07:42 +03:00
item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
}
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
}
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
2014-06-12 00:08:22 +04:00
}
2015-01-14 14:07:42 +03:00
}
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
size_t sourceLabelLHS;
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
2014-06-12 00:08:22 +04:00
}
2015-01-14 14:07:42 +03:00
float ruleSourceLabelledCount;
if (! (tokenizer >> ruleSourceLabelledCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
}
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
}
2014-06-12 00:08:22 +04:00
2015-01-14 14:07:42 +03:00
m_sourceLabelItems.push_back(item);
2014-06-12 00:08:22 +04:00
2014-08-08 00:02:51 +04:00
// } catch (const std::exception &e) {
// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
// }
2014-06-12 00:08:22 +04:00
}
// keep only top N label vectors
const size_t N=50;
if (ruleLabelledCountsPQ.size() > N) {
float topNRuleLabelledCount = std::numeric_limits<int>::max();
for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
topNRuleLabelledCount = ruleLabelledCountsPQ.top();
ruleLabelledCountsPQ.pop();
}
size_t nKept=0;
std::list<SourceLabelsPhrasePropertyItem>::iterator itemIter=m_sourceLabelItems.begin();
while (itemIter!=m_sourceLabelItems.end()) {
if (itemIter->m_sourceLabelsRHSCount < topNRuleLabelledCount) {
itemIter = m_sourceLabelItems.erase(itemIter);
} else {
std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_sourceLabelsLHSList).begin();
while (itemLHSIter!=(itemIter->m_sourceLabelsLHSList).end()) {
if (itemLHSIter->second < topNRuleLabelledCount) {
itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter);
} else {
if (nKept >= N) {
itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter,(itemIter->m_sourceLabelsLHSList).end());
} else {
++nKept;
++itemLHSIter;
}
}
}
if ((itemIter->m_sourceLabelsLHSList).empty()) {
itemIter = m_sourceLabelItems.erase(itemIter);
} else {
++itemIter;
}
}
}
}
};
} // namespace Moses