mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
Add option to dump OOVs to file
This commit is contained in:
parent
37494dd673
commit
bfae09725c
@ -80,13 +80,15 @@ public:
|
||||
OutputCollector* latticeSamplesCollector,
|
||||
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
|
||||
OutputCollector* detailedTranslationCollector,
|
||||
OutputCollector* alignmentInfoCollector ) :
|
||||
OutputCollector* alignmentInfoCollector,
|
||||
OutputCollector* unknownsCollector) :
|
||||
m_source(source), m_lineNumber(lineNumber),
|
||||
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
|
||||
m_latticeSamplesCollector(latticeSamplesCollector),
|
||||
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
|
||||
m_detailedTranslationCollector(detailedTranslationCollector),
|
||||
m_alignmentInfoCollector(alignmentInfoCollector) {}
|
||||
m_alignmentInfoCollector(alignmentInfoCollector),
|
||||
m_unknownsCollector(unknownsCollector) {}
|
||||
|
||||
/** Translate one sentence
|
||||
* gets called by main function implemented at end of this source file */
|
||||
@ -268,6 +270,17 @@ public:
|
||||
m_detailedTranslationCollector->Write(m_lineNumber,out.str());
|
||||
}
|
||||
|
||||
//list of unknown words
|
||||
if (m_unknownsCollector) {
|
||||
const vector<Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
|
||||
ostringstream out;
|
||||
for (size_t i = 0; i < unknowns.size(); ++i) {
|
||||
out << *(unknowns[i]);
|
||||
}
|
||||
out << endl;
|
||||
m_unknownsCollector->Write(m_lineNumber, out.str());
|
||||
}
|
||||
|
||||
// report additional statistics
|
||||
IFVERBOSE(2) {
|
||||
PrintUserTime("Sentence Decoding Time:");
|
||||
@ -291,6 +304,7 @@ private:
|
||||
OutputCollector* m_searchGraphCollector;
|
||||
OutputCollector* m_detailedTranslationCollector;
|
||||
OutputCollector* m_alignmentInfoCollector;
|
||||
OutputCollector* m_unknownsCollector;
|
||||
std::ofstream *m_alignmentStream;
|
||||
|
||||
|
||||
@ -472,6 +486,18 @@ int main(int argc, char** argv)
|
||||
if (!staticData.GetAlignmentOutputFile().empty()) {
|
||||
alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
|
||||
}
|
||||
|
||||
//initialise stream for unknown (oov) words
|
||||
auto_ptr<OutputCollector> unknownsCollector;
|
||||
auto_ptr<ofstream> unknownsStream;
|
||||
if (!staticData.GetOutputUnknownsFile().empty()) {
|
||||
unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
|
||||
if (!unknownsStream->good()) {
|
||||
TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns");
|
||||
exit(1);
|
||||
}
|
||||
unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
|
||||
}
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
ThreadPool pool(staticData.ThreadCount());
|
||||
@ -492,7 +518,8 @@ int main(int argc, char** argv)
|
||||
wordGraphCollector.get(),
|
||||
searchGraphCollector.get(),
|
||||
detailedTranslationCollector.get(),
|
||||
alignmentInfoCollector.get() );
|
||||
alignmentInfoCollector.get(),
|
||||
unknownsCollector.get() );
|
||||
// execute task
|
||||
#ifdef WITH_THREADS
|
||||
pool.Submit(task);
|
||||
|
@ -148,6 +148,7 @@ Parameter::Parameter()
|
||||
AddParam("alignment-output-file", "print output word alignments into given file");
|
||||
AddParam("sort-word-alignment", "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
|
||||
AddParam("start-translation-id", "Id of 1st input. Default = 0");
|
||||
AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
|
||||
|
||||
// Compact phrase table and reordering table.
|
||||
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
|
||||
|
@ -241,6 +241,16 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
#endif
|
||||
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
|
||||
SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
|
||||
|
||||
// Optional file that receives the unknown (OOV) words; exactly one file
// name must be given when the option is used.
if (m_parameter->isParamSpecified("output-unknowns")) {
  if (m_parameter->GetParam("output-unknowns").size() != 1) {
    UserMessage::Add(string("need to specify exactly one file name for unknowns"));
    return false;
  }
  m_outputUnknownsFile = Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
}
|
||||
|
||||
// include feature names in the n-best list
|
||||
SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );
|
||||
|
@ -198,6 +198,7 @@ protected:
|
||||
#endif
|
||||
bool m_unprunedSearchGraph; //! do not exclude dead ends (chart decoder only)
|
||||
bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph
|
||||
std::string m_outputUnknownsFile; //! output unknowns in this file
|
||||
|
||||
size_t m_cubePruningPopLimit;
|
||||
size_t m_cubePruningDiversity;
|
||||
@ -564,6 +565,10 @@ public:
|
||||
return m_outputSearchGraphPB;
|
||||
}
|
||||
#endif
|
||||
const std::string& GetOutputUnknownsFile() const {
|
||||
return m_outputUnknownsFile;
|
||||
}
|
||||
|
||||
bool GetUnprunedSearchGraph() const {
|
||||
return m_unprunedSearchGraph;
|
||||
}
|
||||
|
@ -667,6 +667,11 @@ std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& c
|
||||
return out;
|
||||
}
|
||||
|
||||
const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
|
||||
{
|
||||
return m_unksrcs;
|
||||
}
|
||||
|
||||
void TranslationOptionCollection::CacheLexReordering()
|
||||
{
|
||||
const vector<LexicalReordering*> &lexReorderingModels = m_system->GetReorderModels();
|
||||
|
@ -96,6 +96,9 @@ protected:
|
||||
|
||||
//! implemented by inherited class, called by this class
|
||||
virtual void ProcessUnknownWord(size_t sourcePos)=0;
|
||||
|
||||
|
||||
|
||||
void CacheLexReordering();
|
||||
|
||||
public:
|
||||
@ -106,6 +109,9 @@ public:
|
||||
return m_source;
|
||||
}
|
||||
|
||||
//! List of unknown (OOV) source phrases
|
||||
const std::vector<Phrase*>& GetUnknownSources() const;
|
||||
|
||||
//! get length/size of source input
|
||||
size_t GetSize() const {
|
||||
return m_source.GetSize();
|
||||
|
Loading…
Reference in New Issue
Block a user