add placeholder support to extract

2024-12-27 22:14:57 +03:00 · 2013-07-26 15:44:29 +01:00 · 2013-07-26 15:44:29 +01:00 · e3917f911b
commit e3917f911b
parent 89058b7935
1 changed files with 39 additions and 6 deletions
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@ -74,7 +74,6 @@ bool le(int, int);
 bool lt(int, int);

 bool isAligned (SentenceAlignment &, int, int);
-bool isGoodPlaceholderRule (SentenceAlignment &, int, int);

 int sentenceOffset = 0;

@ -105,6 +104,8 @@ private:
  void extract(SentenceAlignment &);
  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
  void writePhrasesToFile();
+  bool isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); // false if a placeholder word in the source span [startF,endF] or target span [startE,endE] has an alignment count other than 1
+  bool isPlaceholder(const string &word); // true if word equals an entry of m_options.placeholders

  SentenceAlignment &m_sentence;
  const PhraseExtractionOptions &m_options;
@ -685,6 +686,10 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
    return;
  }

+  if (!isGoodPlaceholderRule(sentence, startE, endE, startF, endF)) {
+	  return;
+  }
+
  for(int fi=startF; fi<=endF; fi++) {
    if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
@ -806,6 +811,39 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )

 }

+/**
+ * Decide whether the phrase pair covering source positions [startF, endF] and
+ * target positions [startE, endE] is acceptable with respect to placeholders.
+ *
+ * Every source word for which isPlaceholder() is true must have
+ * sentence.alignedCountS[pos] == 1, and every such target word must have
+ * exactly one entry in sentence.alignedToT[pos]; otherwise the pair is rejected.
+ *
+ * @param sentence  sentence pair whose source/target tokens and alignment data are read
+ * @param startE    first target position (inclusive)
+ * @param endE      last target position (inclusive)
+ * @param startF    first source position (inclusive)
+ * @param endF      last source position (inclusive)
+ * @return false as soon as an offending placeholder is found, true otherwise
+ */
+bool ExtractTask::isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
+{
+  // The counters are int, like the bounds: an unsigned counter compared with a
+  // negative end bound would see that bound converted to a huge value and walk
+  // past the end of the sentence instead of skipping the loop.
+  for (int pos = startF; pos <= endF; ++pos) {
+    const string &word = sentence.source[pos];
+    if (isPlaceholder(word) && sentence.alignedCountS[pos] != 1) {
+      return false;
+    }
+  }
+
+  for (int pos = startE; pos <= endE; ++pos) {
+    const string &word = sentence.target[pos];
+    if (isPlaceholder(word) && sentence.alignedToT[pos].size() != 1) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Tell whether a token is one of the configured placeholder symbols.
+ *
+ * @param word  token to look up
+ * @return true if word compares equal to an entry of m_options.placeholders,
+ *         false otherwise (including when the list is empty)
+ */
+bool ExtractTask::isPlaceholder(const string &word)
+{
+  const size_t total = m_options.placeholders.size();
+  size_t idx = 0;
+  while (idx < total) {
+    if (word == m_options.placeholders[idx]) {
+      return true;
+    }
+    ++idx;
+  }
+  return false;
+}
 /** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument.
 		The separator can only be 1 character long. The default delimiters are space or tab
 */
@ -830,9 +868,4 @@ std::vector<std::string> Tokenize(const std::string& str,
  return tokens;
 }

-bool isGoodPlaceholderRule (SentenceAlignment &, int, int)
-{
-
-  return true;
-}
 }