facilitate programmatic creation of word lattices

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3848 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
redpony 2011-01-25 20:08:29 +00:00
parent 22ce1d2f19
commit eddb28e0ce
4 changed files with 30 additions and 18 deletions

View File

@ -544,10 +544,10 @@ void ARPAToSortedFiles(util::FilePiece &f, const std::vector<uint64_t> &counts,
// Only use as much buffer as we need.
size_t buffer_use = 0;
for (unsigned int order = 2; order < counts.size(); ++order) {
buffer_use = std::max(buffer_use, (sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]);
buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
}
buffer_use = std::max(buffer_use, (sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back());
buffer = std::min(buffer, buffer_use);
buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
buffer = std::min((size_t)buffer, buffer_use);
util::scoped_memory mem;
mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED);

View File

@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <cstdlib>
/** A couple of utilities to read .pcn files. A python-compatible format
* for encoding confusion networks.
* for encoding confusion networks and word lattices.
*/
namespace PCN {
@ -36,8 +36,8 @@ namespace PCN {
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;
/** Given a string ((('foo',0.1),('bar',0.9)),...) representation of a
* confusion net in PCN format, return a CN object
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/
CN parsePCN(const std::string& in);

View File

@ -30,40 +30,33 @@ void WordLattice::Print(std::ostream& out) const {
out<<"\n\n";
}
int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::map<std::string, std::string> meta=ProcessAndStripSGML(line);
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
size_t numLinkParams = StaticData::Instance().GetNumLinkParams();
size_t numLinkWeights = StaticData::Instance().GetNumInputScores();
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
//when we have one more weight than params, we add a word count feature
bool addRealWordCount = ((numLinkParams + 1) == numLinkWeights);
PCN::CN cn = PCN::parsePCN(line);
data.resize(cn.size());
next_nodes.resize(cn.size());
for(size_t i=0;i<cn.size();++i) {
PCN::CNCol& col = cn[i];
const PCN::CNCol& col = cn[i];
if (col.empty()) return false;
data[i].resize(col.size());
next_nodes[i].resize(col.size());
for (size_t j=0;j<col.size();++j) {
PCN::CNAlt& alt = col[j];
const PCN::CNAlt& alt = col[j];
//check for correct number of link parameters
if (alt.first.second.size() != numLinkParams) {
TRACE_ERR("ERROR: need " << numLinkParams << " link parameters, found " << alt.first.second.size() << " while reading column " << i << " from " << line << "\n");
TRACE_ERR("ERROR: need " << numLinkParams << " link parameters, found " << alt.first.second.size() << " while reading column " << i << " from " << debug_line << "\n");
return false;
}
//check each element for bounds
std::vector<float>::iterator probsIterator;
std::vector<float>::const_iterator probsIterator;
data[i][j].second = std::vector<float>(0);
for(probsIterator = alt.first.second.begin(); probsIterator < alt.first.second.end(); probsIterator++) {
IFVERBOSE(1) {
@ -114,6 +107,18 @@ int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrde
return !cn.empty();
}
int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::map<std::string, std::string> meta=ProcessAndStripSGML(line);
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
PCN::CN cn = PCN::parsePCN(line);
return InitializeFromPCNDataType(cn, factorOrder, line);
}
void WordLattice::GetAsEdgeMatrix(std::vector<std::vector<bool> >& edges) const
{
edges.resize(data.size()+1,std::vector<bool>(data.size()+1, false));

View File

@ -3,6 +3,7 @@
#include <vector>
#include "ConfusionNet.h"
#include "PCNTools.h"
namespace Moses
{
@ -23,6 +24,12 @@ public:
// is it possible to get from the edge of the previous word range to the current word range
virtual bool CanIGetFromAToB(size_t start, size_t end) const;
/** Given a lattice represented using the PCN::CN data type (topologically sorted adjacency list
* representation), initialize the WordLattice object
*/
int InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line = "");
/** Read from PLF format (1 lattice per line)
*/
int Read(std::istream& in,const std::vector<FactorType>& factorOrder);
/** Convert internal representation into an edge matrix