optionally ignore punctuation for glm

This commit is contained in:
Eva Hasler 2012-02-15 11:27:00 +00:00
parent 30cf66d180
commit 6ce2fed6ff
3 changed files with 44 additions and 21 deletions

View File

@ -9,19 +9,25 @@ using namespace std;
namespace Moses
{
// Builds the unlimited global lexical model feature.
//
// inFactors / outFactors are forwarded unchanged to LoadData().
// ignorePunctuation is stored in m_ignorePunctuation; when it is true the
// punctuation lookup table (m_punctuationHash) is filled in so that other
// members can consult it.
//
// The feature is registered with the base class under the name "glm" with
// ScoreProducer::unlimited, and the sparse producer weight starts at 1.
GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const vector< FactorType >& inFactors,
    const vector< FactorType >& outFactors,
    bool ignorePunctuation)
  : StatelessFeatureFunction("glm",ScoreProducer::unlimited),
    m_sparseProducerWeight(1),
    m_ignorePunctuation(ignorePunctuation)
{
  // The progress message is completed (and the line terminated) at the end
  // of the constructor, hence no newline here.
  std::cerr << "Creating global lexical model unlimited.. ";

  // load model
  LoadData( inFactors, outFactors );

  if (m_ignorePunctuation) {
    std::cerr << "ignoring punctuation";

    // Every char of this literal becomes a key in m_punctuationHash.
    // Note that the ASCII digits are part of the list as well.
    // NOTE(review): the table is keyed on single chars, so if this file is
    // stored as UTF-8 the non-ASCII symbols contribute their individual
    // bytes rather than whole characters -- confirm this is intended.
    const char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";

    // sizeof includes the terminating NUL, which must not be registered.
    const size_t punctuationCount = sizeof(punctuation) - 1;
    for (size_t i = 0; i < punctuationCount; ++i)
      m_punctuationHash[punctuation[i]] = 1;
  }

  std::cerr << std::endl;
}
// Destructor: the body is empty, no explicit cleanup is performed here.
GlobalLexicalModelUnlimited::~GlobalLexicalModelUnlimited(){}
@ -46,22 +52,26 @@ void GlobalLexicalModelUnlimited::Evaluate(const TargetPhrase& targetPhrase, Sco
for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
string targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
// check if first char is punctuation
/* char firstChar = targetString.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;*/
if (m_ignorePunctuation) {
// check if first char is punctuation
char firstChar = targetString.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
// set< const Word*, WordComparer > alreadyScored; // do not score a word twice
StringHash alreadyScored;
for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
string inputString = input.GetWord(inputIndex).GetString(0); // TODO: change for other factors
// check if first char is punctuation
/* firstChar = inputString.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;*/
if (m_ignorePunctuation) {
// check if first char is punctuation
char firstChar = inputString.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
//if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
if ( alreadyScored.find(inputString) == alreadyScored.end()) {

View File

@ -33,7 +33,7 @@ class InputType;
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
{
// typedef std::map< char, short > CharHash;
typedef std::map< char, short > CharHash;
typedef std::map< std::string, short > StringHash;
struct ThreadLocalStorage
@ -51,7 +51,8 @@ private:
private:
const Sentence *m_input;
// CharHash m_punctuationHash;
CharHash m_punctuationHash;
bool m_ignorePunctuation;
std::vector< FactorType > m_inputFactors;
std::vector< FactorType > m_outputFactors;
@ -63,7 +64,8 @@ private:
public:
GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors,
const std::vector< FactorType >& outFactors);
const std::vector< FactorType >& outFactors,
bool ignorePunctuation);
virtual ~GlobalLexicalModelUnlimited();

View File

@ -972,14 +972,25 @@ bool StaticData::LoadGlobalLexicalModelUnlimited()
}
// Create one GlobalLexicalModelUnlimited per configured weight.
//
// Each model specification has the form
//   <input factors>-<output factors>[ <flag>]
// where the factor lists are comma separated and the optional, space
// separated <flag> is an integer; a non-zero value makes the model ignore
// punctuation. When the flag is absent, punctuation is not ignored.
for (size_t i = 0; i < weight.size(); i++ ) {
  bool ignorePunctuation = false;
  vector< string > factors;

  // Split off the optional punctuation flag from the factor definition.
  const vector< string > specFields = Tokenize(modelSpec[i]," ");
  if (!specFields.empty()) {
    factors = Tokenize(specFields[0],"-");
    // Only read the flag when it was actually supplied; indexing [1] on a
    // single-token specification would be out of bounds.
    if (specFields.size() > 1)
      ignorePunctuation = (Scan<int>(specFields[1]) != 0);
  }
  // An empty specification leaves 'factors' empty and is rejected below.

  if ( factors.size() != 2 ) {
    std::cerr << "wrong factor definition for global lexical model unlimited: " << modelSpec[i] << endl;
    return false;
  }

  const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
  const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");

  m_globalLexicalModelsUnlimited.push_back(new GlobalLexicalModelUnlimited(inputFactors, outputFactors, ignorePunctuation));
  // back() is the model that was just appended, independent of how many
  // entries the container held before this loop started.
  m_globalLexicalModelsUnlimited.back()->SetSparseProducerWeight(weight[i]);
}
return true;