Add option to dump OOVs to file

This commit is contained in:
Barry Haddow 2012-09-21 08:55:37 +01:00
parent 37494dd673
commit bfae09725c
6 changed files with 57 additions and 3 deletions

View File

@ -80,13 +80,15 @@ public:
OutputCollector* latticeSamplesCollector,
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector ) :
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector) {}
m_alignmentInfoCollector(alignmentInfoCollector),
m_unknownsCollector(unknownsCollector) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@ -268,6 +270,17 @@ public:
m_detailedTranslationCollector->Write(m_lineNumber,out.str());
}
// Dump this sentence's unknown (OOV) source phrases, but only when an
// unknowns collector was supplied to the task.
if (m_unknownsCollector) {
  const vector<Phrase*>& oovPhrases = manager.getSntTranslationOptions()->GetUnknownSources();
  ostringstream oovLine;
  // stream every phrase, in stored order, onto a single line
  for (vector<Phrase*>::const_iterator phrase = oovPhrases.begin();
       phrase != oovPhrases.end(); ++phrase) {
    oovLine << **phrase;
  }
  oovLine << endl;
  // the line number is passed along with the text for this sentence
  m_unknownsCollector->Write(m_lineNumber, oovLine.str());
}
// report additional statistics
IFVERBOSE(2) {
PrintUserTime("Sentence Decoding Time:");
@ -291,6 +304,7 @@ private:
OutputCollector* m_searchGraphCollector;
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
std::ofstream *m_alignmentStream;
@ -472,6 +486,18 @@ int main(int argc, char** argv)
if (!staticData.GetAlignmentOutputFile().empty()) {
alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
}
// Initialise the optional output stream for unknown (OOV) words.
// Both smart pointers stay empty unless an unknowns file name was configured.
// The stream owner is declared before the collector: locals are destroyed in
// reverse order of declaration, so the ofstream stays alive for as long as the
// collector that was handed its raw pointer.
auto_ptr<ofstream> unknownsStream;
auto_ptr<OutputCollector> unknownsCollector;
if (!staticData.GetOutputUnknownsFile().empty()) {
  unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
  if (!unknownsStream->good()) {
    // end the message with a newline so it does not run into later terminal/log output
    TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns" << endl);
    exit(1);
  }
  // get() does not release ownership: unknownsStream still owns the ofstream
  unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
}
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
@ -492,7 +518,8 @@ int main(int argc, char** argv)
wordGraphCollector.get(),
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get() );
alignmentInfoCollector.get(),
unknownsCollector.get() );
// execute task
#ifdef WITH_THREADS
pool.Submit(task);

View File

@ -148,6 +148,7 @@ Parameter::Parameter()
AddParam("alignment-output-file", "print output word alignments into given file");
AddParam("sort-word-alignment", "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
AddParam("start-translation-id", "Id of 1st input. Default = 0");
AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
// Compact phrase table and reordering table.
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");

View File

@ -241,6 +241,16 @@ bool StaticData::LoadData(Parameter *parameter)
#endif
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
// Optional "output-unknowns" switch: when given, it must carry exactly one
// value, which is stored as the unknowns output file name.
if (m_parameter->isParamSpecified("output-unknowns")) {
  // reject anything other than a single file name and abort loading
  if (m_parameter->GetParam("output-unknowns").size() != 1) {
    UserMessage::Add(string("need to specify exactly one file name for unknowns"));
    return false;
  }
  m_outputUnknownsFile = Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
}
// include feature names in the n-best list
SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );

View File

@ -198,6 +198,7 @@ protected:
#endif
bool m_unprunedSearchGraph; //! do not exclude dead ends (chart decoder only)
bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph
std::string m_outputUnknownsFile; //! output unknowns in this file
size_t m_cubePruningPopLimit;
size_t m_cubePruningDiversity;
@ -564,6 +565,10 @@ public:
return m_outputSearchGraphPB;
}
#endif
//! name of the file that unknown (OOV) words are written to;
//! returned by reference to the stored member, so no copy is made
const std::string& GetOutputUnknownsFile() const {
return m_outputUnknownsFile;
}
//! value of the unpruned-search-graph flag (m_unprunedSearchGraph)
bool GetUnprunedSearchGraph() const {
return m_unprunedSearchGraph;
}

View File

@ -667,6 +667,11 @@ std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& c
return out;
}
/** Read-only access to the stored list of unknown (OOV) source phrases.
 * Returns a reference to the m_unksrcs member itself, so no copy is made.
 */
const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
{
return m_unksrcs;
}
void TranslationOptionCollection::CacheLexReordering()
{
const vector<LexicalReordering*> &lexReorderingModels = m_system->GetReorderModels();

View File

@ -96,6 +96,9 @@ protected:
//! implemented by inherited class, called by this class
virtual void ProcessUnknownWord(size_t sourcePos)=0;
void CacheLexReordering();
public:
@ -106,6 +109,9 @@ public:
return m_source;
}
//! list of unknown (OOV) source phrases, returned by const reference
const std::vector<Phrase*>& GetUnknownSources() const;
//! get length/size of source input
size_t GetSize() const {
return m_source.GetSize();