add placeholder support to extract

This commit is contained in:
Hieu Hoang 2013-07-26 15:44:29 +01:00
parent 89058b7935
commit e3917f911b

View File

@ -74,7 +74,6 @@ bool le(int, int);
bool lt(int, int);
bool isAligned (SentenceAlignment &, int, int);
bool isGoodPlaceholderRule (SentenceAlignment &, int, int);
int sentenceOffset = 0;
@ -105,6 +104,8 @@ private:
void extract(SentenceAlignment &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
void writePhrasesToFile();
bool isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
bool isPlaceholder(const string &word);
SentenceAlignment &m_sentence;
const PhraseExtractionOptions &m_options;
@ -685,6 +686,10 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
return;
}
if (!isGoodPlaceholderRule(sentence, startE, endE, startF, endF)) {
return;
}
for(int fi=startF; fi<=endF; fi++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
@ -806,6 +811,39 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
}
bool ExtractTask::isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
{
for (size_t pos = startF; pos <= endF; ++pos) {
const string &word = sentence.source[pos];
if (isPlaceholder(word)) {
if (sentence.alignedCountS[pos] != 1) {
return false;
}
}
}
for (size_t pos = startE; pos <= endE; ++pos) {
const string &word = sentence.target[pos];
if (isPlaceholder(word)) {
if (sentence.alignedToT[pos].size() != 1) {
return false;
}
}
}
return true;
}
/**
 * Tell whether a word is one of the configured placeholder symbols.
 *
 * @param word  token to look up
 * @return true if word compares equal to an entry of m_options.placeholders
 */
bool ExtractTask::isPlaceholder(const string &word)
{
  const size_t numPlaceholders = m_options.placeholders.size();

  // Advance until a matching entry is found or the list is exhausted.
  size_t idx = 0;
  while (idx < numPlaceholders && !(word == m_options.placeholders[idx])) {
    ++idx;
  }

  // Stopping before the end means a match was found.
  return idx < numPlaceholders;
}
/** Tokenise the input string into a vector of strings, splitting at every character that appears in the delimiters argument.
Each separator is exactly one character long. The default delimiters are space and tab.
*/
@ -830,9 +868,4 @@ std::vector<std::string> Tokenize(const std::string& str,
return tokens;
}
// Stub check: none of the arguments is inspected and every call is accepted.
bool isGoodPlaceholderRule (SentenceAlignment & /*sentence*/, int /*unused*/, int /*unused*/)
{
  // Unconditional acceptance.
  return true;
}
}