Refactor hypergraph output code

This commit is contained in:
Barry Haddow 2014-08-06 15:29:39 +01:00
parent 9106854ec7
commit c99a889420
5 changed files with 224 additions and 142 deletions

View File

@ -22,14 +22,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
* Moses main, for single-threaded and multi-threaded.
**/
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <exception>
#include <fstream>
#include <sstream>
@ -47,6 +39,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "mbr.h"
#include "moses/Hypothesis.h"
#include "moses/HypergraphOutput.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "moses/Util.h"
@ -95,7 +88,7 @@ public:
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
bool outputSearchGraphHypergraph) :
boost::shared_ptr<HypergraphOutput> hypergraphOutput) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
@ -104,7 +97,7 @@ public:
m_alignmentInfoCollector(alignmentInfoCollector),
m_unknownsCollector(unknownsCollector),
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
m_hypergraphOutput(hypergraphOutput) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@ -184,105 +177,10 @@ public:
}
// Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
if (m_outputSearchGraphHypergraph) {
vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
bool appendSuffix;
if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
appendSuffix = true;
} else {
appendSuffix = false;
}
string compression;
if (hypergraphParameters.size() > 1) {
compression = hypergraphParameters[1];
} else {
compression = "txt";
}
string hypergraphDir;
if ( hypergraphParameters.size() > 2 ) {
hypergraphDir = hypergraphParameters[2];
} else {
string nbestFile = staticData.GetNBestFilePath();
if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
boost::filesystem::path nbestPath(nbestFile);
// In the Boost filesystem API version 2,
// which was the default prior to Boost 1.46,
// the filename() method returned a string.
//
// In the Boost filesystem API version 3,
// which is the default starting with Boost 1.46,
// the filename() method returns a path object.
//
// To get a string from the path object,
// the native() method must be called.
// hypergraphDir = nbestPath.parent_path().filename()
//#if BOOST_VERSION >= 104600
// .native()
//#endif
//;
// Hopefully the following compiles under all versions of Boost.
//
// If this line gives you compile errors,
// contact Lane Schwartz on the Moses mailing list
hypergraphDir = nbestPath.parent_path().string();
} else {
stringstream hypergraphDirName;
hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
hypergraphDir = hypergraphDirName.str();
}
}
if ( ! boost::filesystem::exists(hypergraphDir) ) {
boost::filesystem::create_directory(hypergraphDir);
}
if ( ! boost::filesystem::exists(hypergraphDir) ) {
TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
} else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
} else {
stringstream fileName;
fileName << hypergraphDir << "/" << m_lineNumber;
if ( appendSuffix ) {
fileName << "." << compression;
}
boost::iostreams::filtering_ostream *file
= new boost::iostreams::filtering_ostream;
if ( compression == "gz" ) {
file->push( boost::iostreams::gzip_compressor() );
} else if ( compression == "bz2" ) {
file->push( boost::iostreams::bzip2_compressor() );
} else if ( compression != "txt" ) {
TRACE_ERR("Unrecognized hypergraph compression format ("
<< compression
<< ") - using uncompressed plain txt" << std::endl);
compression = "txt";
}
file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
if (file->is_complete() && file->good()) {
fix(*file,PRECISION);
manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
<< " because the output file " << fileName.str()
<< " is not open or not ready for writing"
<< std::endl);
}
file -> pop();
delete file;
}
if (m_hypergraphOutput.get()) {
m_hypergraphOutput->Write(manager);
}
additionalReportingTime.stop();
// apply decision rule and output best translation(s)
@ -476,7 +374,7 @@ private:
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
bool m_outputSearchGraphSLF;
bool m_outputSearchGraphHypergraph;
boost::shared_ptr<HypergraphOutput> m_hypergraphOutput;
std::ofstream *m_alignmentStream;
@ -591,30 +489,9 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
boost::shared_ptr<HypergraphOutput> hypergraphOutput;
if (staticData.GetOutputSearchGraphHypergraph()) {
ofstream* weightsOut = new std::ofstream;
stringstream weightsFilename;
if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
} else {
string nbestFile = staticData.GetNBestFilePath();
if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
boost::filesystem::path nbestPath(nbestFile);
weightsFilename << nbestPath.parent_path().filename() << "/weights";
} else {
weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights";
}
}
boost::filesystem::path weightsFilePath(weightsFilename.str());
if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
boost::filesystem::create_directory(weightsFilePath.parent_path());
}
TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
weightsOut->open(weightsFilename.str().c_str());
OutputFeatureWeightsForHypergraph(*weightsOut);
weightsOut->flush();
weightsOut->close();
delete weightsOut;
hypergraphOutput.reset(new HypergraphOutput(PRECISION));
}
@ -725,7 +602,7 @@ int main(int argc, char** argv)
alignmentInfoCollector.get(),
unknownsCollector.get(),
staticData.GetOutputSearchGraphSLF(),
staticData.GetOutputSearchGraphHypergraph());
hypergraphOutput);
// execute task
#ifdef WITH_THREADS
pool.Submit(task);

153
moses/HypergraphOutput.cpp Normal file
View File

@ -0,0 +1,153 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2014- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <util/exception.hh>
#include "HypergraphOutput.h"
#include "Manager.h"
using namespace std;
namespace Moses {
/**
 * Configure hypergraph output from the "output-search-graph-hypergraph"
 * parameter, make sure the output directory exists, and write the feature
 * weights to "<directory>/weights".
 *
 * Parameter values, as read below:
 *   [0] "true" to append the compression name as a suffix to each file
 *   [1] compression type: "txt" (default), "gz" or "bz2"
 *   [2] output directory; when absent, the directory holding the n-best
 *       file is used, or "<current dir>/hypergraph" if there is no n-best file
 *
 * @param precision precision applied to the stream when hypergraphs are written
 * @throws util::Exception if the compression type is unknown, the output
 *         path is not a directory, or the weights file cannot be opened
 */
HypergraphOutput::HypergraphOutput(size_t precision) :
  m_precision(precision)
{
  const StaticData& staticData = StaticData::Instance();
  const vector<string>& hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");

  m_appendSuffix = !hypergraphParameters.empty() && hypergraphParameters[0] == "true";

  if (hypergraphParameters.size() > 1) {
    m_compression = hypergraphParameters[1];
  } else {
    m_compression = "txt";
  }
  UTIL_THROW_IF(m_compression != "txt" && m_compression != "gz" && m_compression != "bz2",
                util::Exception, "Unknown compression type: " << m_compression);

  if (hypergraphParameters.size() > 2) {
    m_hypergraphDir = hypergraphParameters[2];
  } else {
    const string nbestFile = staticData.GetNBestFilePath();
    if (!nbestFile.empty() && nbestFile != "-" && !boost::starts_with(nbestFile, "/dev/stdout")) {
      // string() is used rather than filename()/native() because the return
      // type of those differs between Boost filesystem API versions 2 and 3.
      m_hypergraphDir = boost::filesystem::path(nbestFile).parent_path().string();
      // An n-best path without a directory component has an empty parent;
      // without this fallback the weights file would resolve to "/weights".
      if (m_hypergraphDir.empty()) {
        m_hypergraphDir = ".";
      }
    } else {
      m_hypergraphDir = boost::filesystem::current_path().string() + "/hypergraph";
    }
  }

  if (!boost::filesystem::exists(m_hypergraphDir)) {
    // create_directories also creates missing parent directories.
    boost::filesystem::create_directories(m_hypergraphDir);
  }
  UTIL_THROW_IF(!boost::filesystem::is_directory(m_hypergraphDir),
                util::Exception, "Cannot output hypergraphs to " << m_hypergraphDir << " because that path exists, but is not a directory");

  const string weightsFilename = m_hypergraphDir + "/weights";
  TRACE_ERR("The weights file is " << weightsFilename << "\n");
  ofstream weightsOut(weightsFilename.c_str());
  // Fail early instead of silently producing no weights file.
  UTIL_THROW_IF(!weightsOut.good(),
                util::Exception, "Cannot open weights file " << weightsFilename << " for writing");
  weightsOut.setf(std::ios::fixed);
  weightsOut.precision(6);
  staticData.GetAllWeights().Save(weightsOut);
  weightsOut.close();
}
void HypergraphOutput::Write(const Manager& manager) const {
stringstream fileName;
fileName << m_hypergraphDir << "/" << manager.GetLineNumber();
if ( m_appendSuffix ) {
fileName << "." << m_compression;
}
boost::iostreams::filtering_ostream file;
if ( m_compression == "gz" ) {
file.push( boost::iostreams::gzip_compressor() );
} else if ( m_compression == "bz2" ) {
file.push( boost::iostreams::bzip2_compressor() );
}
file.push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
if (file.is_complete() && file.good()) {
file.setf(std::ios::fixed);
file.precision(m_precision);
manager.OutputSearchGraphAsHypergraph(file);
file.flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << manager.GetLineNumber()
<< " because the output file " << fileName.str()
<< " is not open or not ready for writing"
<< std::endl);
}
file.pop();
}
}

51
moses/HypergraphOutput.h Normal file
View File

@ -0,0 +1,51 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2014- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Hypergraph_Output_h
#define moses_Hypergraph_Output_h

// Included here so that the header is self-contained (size_t, std::string).
#include <cstddef>
#include <string>

/**
* Manage the output of hypergraphs.
**/
namespace Moses {

class Manager;

class HypergraphOutput {
public:
  /**
   * Initialise output directory and create weights file.
   * @param precision precision applied to the stream when hypergraphs are written
   */
  explicit HypergraphOutput(size_t precision);

  /** Write this hypergraph to file */
  void Write(const Manager& manager) const;

private:
  size_t m_precision;           // precision used for hypergraph files
  std::string m_hypergraphDir;  // directory receiving hypergraph and weights files
  std::string m_compression;    // "txt", "gz" or "bz2"
  bool m_appendSuffix;          // append "." + m_compression to file names
};

}
#endif

View File

@ -828,10 +828,10 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
}
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
void Manager::OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const
{
VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl)
VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@ -842,7 +842,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
set<int> terminalNodes;
multimap<int,int> hypergraphIDToArcs;
VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl)
VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
long numNodes = 0;
long endNode = 0;
@ -904,15 +904,15 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
VERBOSE(2,"Search graph to output as hypergraph for sentence " << m_lineNumber
<< " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << m_lineNumber << std::endl)
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
if (hypergraphHypothesisID % 100000 == 0) {
VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl);
VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << m_lineNumber << std::endl);
}
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
@ -935,7 +935,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF2(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
"Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
"Error while writing search lattice as hypergraph for sentence " << m_lineNumber << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
@ -950,7 +950,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
// VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
UTIL_THROW_IF2(
(startNode >= hypergraphHypothesisID),
"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
"Error while writing search lattice as hypergraph for sentence" << m_lineNumber << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);

View File

@ -145,13 +145,14 @@ public:
void GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo );
void GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const;
int GetNextHypoId();
size_t GetLineNumber() const {return m_lineNumber;}
#ifdef HAVE_PROTOBUF
void SerializeSearchGraphPB(long translationId, std::ostream& outputStream) const;
#endif
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
const InputType& GetSource() const {
return m_source;