mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-10 10:59:21 +03:00
add placeholder support to extract
This commit is contained in:
parent
89058b7935
commit
e3917f911b
@ -74,7 +74,6 @@ bool le(int, int);
|
||||
bool lt(int, int);
|
||||
|
||||
bool isAligned (SentenceAlignment &, int, int);
|
||||
bool isGoodPlaceholderRule (SentenceAlignment &, int, int);
|
||||
|
||||
int sentenceOffset = 0;
|
||||
|
||||
@ -105,6 +104,8 @@ private:
|
||||
void extract(SentenceAlignment &);
|
||||
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
||||
void writePhrasesToFile();
|
||||
bool isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
|
||||
bool isPlaceholder(const string &word);
|
||||
|
||||
SentenceAlignment &m_sentence;
|
||||
const PhraseExtractionOptions &m_options;
|
||||
@ -685,6 +686,10 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isGoodPlaceholderRule(sentence, startE, endE, startF, endF)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
|
||||
@ -806,6 +811,39 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
|
||||
}
|
||||
|
||||
bool ExtractTask::isGoodPlaceholderRule (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
|
||||
{
|
||||
for (size_t pos = startF; pos <= endF; ++pos) {
|
||||
const string &word = sentence.source[pos];
|
||||
if (isPlaceholder(word)) {
|
||||
if (sentence.alignedCountS[pos] != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t pos = startE; pos <= endE; ++pos) {
|
||||
const string &word = sentence.target[pos];
|
||||
if (isPlaceholder(word)) {
|
||||
if (sentence.alignedToT[pos].size() != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ExtractTask::isPlaceholder(const string &word)
|
||||
{
|
||||
for (size_t i = 0; i < m_options.placeholders.size(); ++i) {
|
||||
const string &placeholder = m_options.placeholders[i];
|
||||
if (word == placeholder) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/** Tokenise the input string into a vector of strings. Elements are split at any character in the delimiters argument.
|
||||
Each separator is exactly one character long. The default delimiters are space and tab.
|
||||
*/
|
||||
@ -830,9 +868,4 @@ std::vector<std::string> Tokenize(const std::string& str,
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/** Stub placeholder-rule check: ignores all of its arguments and accepts
 *  every span.
 *
 *  \return always true
 */
bool isGoodPlaceholderRule (SentenceAlignment &, int, int)
{
  return true;
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user