optionally ignore punctuation for glm

2024-12-28 14:32:38 +03:00 · 2012-02-15 11:27:00 +00:00 · 2012-02-15 11:27:00 +00:00 · 6ce2fed6ff
commit 6ce2fed6ff
parent 30cf66d180
3 changed files with 44 additions and 21 deletions
--- a/moses/src/GlobalLexicalModelUnlimited.cpp
+++ b/moses/src/GlobalLexicalModelUnlimited.cpp
@@ -9,19 +9,25 @@ using namespace std;
 namespace Moses
 {
 GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const vector< FactorType >& inFactors,
-                                                         const vector< FactorType >& outFactors)
+                                                         const vector< FactorType >& outFactors, // forwarded with inFactors to LoadData below
+                                                         bool ignorePunctuation) // true: Evaluate skips words whose first char is in m_punctuationHash
 : StatelessFeatureFunction("glm",ScoreProducer::unlimited),
-  m_sparseProducerWeight(1)
+  m_sparseProducerWeight(1),
+  m_ignorePunctuation(ignorePunctuation)
 {
-	std::cerr << "Creating global lexical model unlimited...\n";
+	std::cerr << "Creating global lexical model unlimited.. "; // no newline here: the status line is finished at the end of the constructor

 	// load model
 	LoadData( inFactors, outFactors );

 	// compile a list of punctuation characters
-/*	char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~=";
-	for (size_t i=0; i < sizeof(punctuation)-1; ++i)
-		m_punctuationHash[punctuation[i]] = 1;*/
+	if (m_ignorePunctuation) { // the lookup table is only filled when the option is enabled
+		cerr << "ignoring punctuation";
+		char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~="; // "\\" is one backslash followed by the digits 0-9 (not an octal escape). NOTE(review): ¿ · • ‑ are non-ASCII; if this file is UTF-8 each one contributes several bytes to the table - confirm that is intended
+		for (size_t i=0; i < sizeof(punctuation)-1; ++i) // -1 leaves out the terminating NUL of the literal
+			m_punctuationHash[punctuation[i]] = 1; // one entry per char of the literal; Evaluate only tests whether the key is present
+	}
+	cerr << endl; // ends the status line started above
 }

 GlobalLexicalModelUnlimited::~GlobalLexicalModelUnlimited(){}
@@ -46,22 +52,26 @@ void GlobalLexicalModelUnlimited::Evaluate(const TargetPhrase& targetPhrase, Sco
  for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
  	string targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors

-  	// check if first char is punctuation
-/*  	char firstChar = targetString.at(0);
-		CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
-		if(charIterator != m_punctuationHash.end())
-			continue;*/
+  	if (m_ignorePunctuation) {
+  		// check if first char is punctuation
+  		char firstChar = targetString.at(0);
+  		CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+  		if(charIterator != m_punctuationHash.end())
+  			continue;
+  	}

 //  	set< const Word*, WordComparer > alreadyScored; // do not score a word twice
  	StringHash alreadyScored;
  	for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
  		string inputString = input.GetWord(inputIndex).GetString(0); // TODO: change for other factors

-  		// check if first char is punctuation
-/*  		firstChar = inputString.at(0);
-  		CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
-  		if(charIterator != m_punctuationHash.end())
-  			continue;*/
+  		if (m_ignorePunctuation) {
+  			// check if first char is punctuation
+  			char firstChar = inputString.at(0);
+  			CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+  			if(charIterator != m_punctuationHash.end())
+  				continue;
+  		}

  		//if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
  		if ( alreadyScored.find(inputString) == alreadyScored.end()) {
--- a/moses/src/GlobalLexicalModelUnlimited.h
+++ b/moses/src/GlobalLexicalModelUnlimited.h
@@ -33,7 +33,7 @@ class InputType;

 class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
 {
-//	typedef std::map< char, short > CharHash;
+	typedef std::map< char, short > CharHash;
 	typedef std::map< std::string, short > StringHash;

  struct ThreadLocalStorage
@@ -51,7 +51,8 @@ private:
 private:
  const Sentence *m_input;

-//  CharHash m_punctuationHash;
+  CharHash m_punctuationHash;
+  bool m_ignorePunctuation;

  std::vector< FactorType > m_inputFactors;
  std::vector< FactorType > m_outputFactors;
@@ -63,7 +64,8 @@ private:

 public:
  GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors,
-	                            const std::vector< FactorType >& outFactors);
+	                            const std::vector< FactorType >& outFactors,
+	                            bool ignorePunctuation);

  virtual ~GlobalLexicalModelUnlimited();

--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -972,14 +972,25 @@ bool StaticData::LoadGlobalLexicalModelUnlimited()
  }

  for (size_t i = 0; i < weight.size(); i++ ) {
-	vector< string > factors = Tokenize(modelSpec[i],"-");
+    bool ignorePunctuation = false;
+    vector< string > factors;
+    vector< string > factors_punctuation = Tokenize(modelSpec[i]," ");
+
+    // spec is "<in>-<out>[ <flag>]": the optional second token, when non-zero, turns on punctuation skipping
+    if (factors_punctuation.size() > 1) { // only read element [1] when a flag token is really present
+    	factors = Tokenize(factors_punctuation[0],"-");
+    	ignorePunctuation = (Scan<int>(factors_punctuation[1]) != 0);
+    }
+    else // no flag token: keep the default (false) and parse the whole spec as before
+    	factors = Tokenize(modelSpec[i],"-");
+
    if ( factors.size() != 2 ) {
      std::cerr << "wrong factor definition for global lexical model unlimited: " << modelSpec[i] << endl;
      return false;
    }
    const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
    const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
-    m_globalLexicalModelsUnlimited.push_back(new GlobalLexicalModelUnlimited(inputFactors, outputFactors));
+    m_globalLexicalModelsUnlimited.push_back(new GlobalLexicalModelUnlimited(inputFactors, outputFactors, ignorePunctuation));
    m_globalLexicalModelsUnlimited[i]->SetSparseProducerWeight(weight[i]);
  }
  return true;