Added option to output unknown words with prefix 'UNK'

This commit is contained in:
Jeremy Gwinnup 2013-08-15 10:56:20 -04:00
parent c002a81c96
commit 091bed03af
4 changed files with 16 additions and 1 deletions

View File

@ -196,6 +196,7 @@ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<
{
CHECK(outputFactorOrder.size() > 0);
const Phrase& phrase = edge.GetCurrTargetPhrase();
bool markUnknown = StaticData::Instance().GetMarkUnknown();
if (reportAllFactors == true) {
out << phrase;
} else {
@ -212,8 +213,16 @@ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<
}
}
CHECK(factor);
out << *factor;
//preface surface form with UNK if marking unknowns
const Word &word = phrase.GetWord(pos);
if(markUnknown && word.IsOOV()) {
out << "UNK" << *factor;
}
else {
out << *factor;
}
for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
CHECK(factor);

View File

@ -51,6 +51,7 @@ Parameter::Parameter()
AddParam("input-file", "i", "location of the input file to be translated");
AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
AddParam("mark-unknown", "mu", "mark unknown words in output");
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
AddParam("max-phrase-length", "maximum phrase length (default 20)");

View File

@ -364,6 +364,7 @@ bool StaticData::LoadData(Parameter *parameter)
// unknown word processing
SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );
SetBooleanParameter( &m_markUnknown, "mark-unknown", false );
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);

View File

@ -113,6 +113,7 @@ protected:
std::string m_nBestFilePath, m_latticeSamplesFilePath;
bool m_labeledNBestList,m_nBestIncludesSegmentation;
bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them
bool m_markUnknown; //! false = output unknown words without any marker; true = prefix unknown (OOV) words with "UNK" in the surface output
bool m_wordDeletionEnabled;
bool m_disableDiscarding;
@ -276,6 +277,9 @@ public:
//! Returns the drop-unknown flag (m_dropUnknown).
//! false = unknown words are translated as themselves; true = they are dropped.
inline bool GetDropUnknown() const {
return m_dropUnknown;
}
//! Returns the mark-unknown flag (m_markUnknown), loaded from the
//! "mark-unknown" parameter (default false).
//! When true, OutputSurface writes "UNK" in front of each word reported as OOV.
inline bool GetMarkUnknown() const {
return m_markUnknown;
}
//! Returns the current value of the m_disableDiscarding flag.
inline bool GetDisableDiscarding() const {
return m_disableDiscarding;
}