mosesdecoder/mert/Hypergraph.cpp
2014-08-12 11:33:37 +01:00

314 lines
12 KiB
C++

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2014- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <set>
#include <boost/lexical_cast.hpp>
#include "util/double-conversion/double-conversion.h"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "Hypergraph.h"
using namespace std;
static const string kBOS = "<s>";
static const string kEOS = "</s>";
namespace MosesTuning {
StringPiece NextLine(util::FilePiece& from) {
StringPiece line;
while ((line = from.ReadLine()).starts_with("#"));
return line;
}
Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){
}
const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) {
#if BOOST_VERSION >= 104200
Map::const_iterator i= map_.find(str, Hash(), Equals());
#else
std::string copied_str(str.data(), str.size());
Map::const_iterator i = map_.find(copied_str.c_str());
#endif
if (i != map_.end()) return *i;
char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1));
memcpy(copied, str.data(), str.size());
copied[str.size()] = 0;
return *map_.insert(Entry(copied, map_.size())).first;
}
double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
/**
* Reads an incoming edge. Returns edge and source words covered.
**/
static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
Edge* edge = graph.NewEdge();
StringPiece line = from.ReadLine(); //Don't allow comments within edge lists
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
//Target
for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
StringPiece got = *i;
if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) {
// non-terminal
char *end_ptr;
unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10);
UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got);
UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?");
edge->AddWord(NULL);
edge->AddChild(child);
} else {
const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got);
edge->AddWord(&found);
}
}
//Features
++pipes;
for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
StringPiece fv = *i;
if (!fv.size()) break;
size_t equals = fv.find_last_of("=");
UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'");
StringPiece name = fv.substr(0,equals);
StringPiece value = fv.substr(equals+1);
int processed;
float score = converter.StringToFloat(value.data(), value.length(), &processed);
UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'");
edge->AddFeature(name,score);
}
//Covered words
++pipes;
size_t sourceCovered = boost::lexical_cast<size_t>(*pipes);
return pair<Edge*,size_t>(edge,sourceCovered);
}
void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const {
Graph& newGraph = *pNewGraph;
//TODO: Optimise case where no pruning required
//For debug
/*
map<const Edge*, string> edgeIds;
for (size_t i = 0; i < edges_.Size(); ++i) {
stringstream str;
size_t childId = 0;
for (size_t j = 0; j < edges_[i].Words().size(); ++j) {
if (edges_[i].Words()[j]) {
str << edges_[i].Words()[j]->first << " ";
} else {
str << "[" << edges_[i].Children()[childId++] << "] ";
}
}
edgeIds[&(edges_[i])] = str.str();
}
*/
//end For debug
map<const Edge*, FeatureStatsType> edgeBackwardScores;
map<const Edge*, size_t> edgeHeads;
vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore);
vector<vector<const Edge*> > outgoing(vertices_.Size());
//Compute backward scores
for (size_t vi = 0; vi < vertices_.Size(); ++vi) {
// cerr << "Vertex " << vi << endl;
const Vertex& vertex = vertices_[vi];
const vector<const Edge*>& incoming = vertex.GetIncoming();
if (!incoming.size()) {
vertexBackwardScores[vi] = 0;
} else {
for (size_t ei = 0; ei < incoming.size(); ++ei) {
//cerr << "Edge " << edgeIds[incoming[ei]] << endl;
edgeHeads[incoming[ei]]= vi;
FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
//cerr << "\tChild " << incoming[ei]->Children()[i] << endl;
size_t childId = incoming[ei]->Children()[i];
UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore,
HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
outgoing[childId].push_back(incoming[ei]);
incomingScore += vertexBackwardScores[childId];
}
edgeBackwardScores[incoming[ei]]= incomingScore;
//cerr << "Backward score: " << incomingScore << endl;
if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore;
}
}
}
//Compute forward scores
vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore);
map<const Edge*, FeatureStatsType> edgeForwardScores;
for (size_t i = 1; i <= vertices_.Size(); ++i) {
size_t vi = vertices_.Size() - i;
//cerr << "Vertex " << vi << endl;
if (!outgoing[vi].size()) {
vertexForwardScores[vi] = 0;
} else {
for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) {
//cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl;
FeatureStatsType outgoingScore = 0;
//add score of head
outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]];
//cerr << "Forward score " << outgoingScore << endl;
edgeForwardScores[outgoing[vi][ei]] = outgoingScore;
//sum scores of siblings
for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) {
size_t siblingId = outgoing[vi][ei]->Children()[i];
if (siblingId != vi) {
//cerr << "\tSibling " << siblingId << endl;
outgoingScore += vertexBackwardScores[siblingId];
}
}
outgoingScore += outgoing[vi][ei]->GetScore(weights);
if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore;
//cerr << "Vertex " << vi << " forward score " << outgoingScore << endl;
}
}
}
multimap<FeatureStatsType, const Edge*> edgeScores;
for (size_t i = 0; i < edges_.Size(); ++i) {
const Edge* edge = &(edges_[i]);
if (edgeForwardScores.find(edge) == edgeForwardScores.end()) {
//This edge has no children, so didn't get a forward score. Its forward score
//is that of its head
edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]];
}
FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge];
edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge));
// cerr << edgeIds[edge] << " " << score << endl;
}
multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin();
size_t edgeCount = 1;
while(edgeCount < minEdgeCount && ei != edgeScores.rend()) {
++ei;
++edgeCount;
}
multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin();
if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first);
//cerr << "Retained edges" << endl;
set<size_t> retainedVertices;
set<const Edge*> retainedEdges;
for (; lowest != edgeScores.end(); ++lowest) {
//cerr << lowest->first << " " << edgeIds[lowest->second] << endl;
retainedEdges.insert(lowest->second);
retainedVertices.insert(edgeHeads[lowest->second]);
for (size_t i = 0; i < lowest->second->Children().size(); ++i) {
retainedVertices.insert(lowest->second->Children()[i]);
}
}
newGraph.SetCounts(retainedVertices.size(), retainedEdges.size());
//cerr << "Retained vertices" << endl;
map<size_t,size_t> oldIdToNew;
size_t vi = 0;
for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) {
//cerr << *i << " New: " << vi << endl;
oldIdToNew[*i] = vi;
Vertex* vertex = newGraph.NewVertex();
vertex->SetSourceCovered(vertices_[*i].SourceCovered());
}
for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) {
Edge* newEdge = newGraph.NewEdge();
const Edge* oldEdge = *i;
for (size_t j = 0; j < oldEdge->Words().size(); ++j) {
newEdge->AddWord(oldEdge->Words()[j]);
}
for (size_t j = 0; j < oldEdge->Children().size(); ++j) {
newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]);
}
newEdge->SetFeatures(oldEdge->Features());
Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]];
newHead.AddEdge(newEdge);
}
/*
cerr << "New graph" << endl;
for (size_t vi = 0; vi < newGraph.VertexSize(); ++vi) {
cerr << "Vertex " << vi << endl;
const vector<const Edge*> incoming = newGraph.GetVertex(vi).GetIncoming();
for (size_t ei = 0; ei < incoming.size(); ++ei) {
size_t childId = 0;
for (size_t wi = 0; wi < incoming[ei]->Words().size(); ++wi) {
const Vocab::Entry* word = incoming[ei]->Words()[wi];
if (word) {
cerr << word->first << " ";
} else {
cerr << "[" << incoming[ei]->Children()[childId++] << "] ";
}
}
cerr << " Score: " << incoming[ei]->GetScore(weights) << endl;
}
cerr << endl;
}
*/
}
/**
* Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments)
**/
void ReadGraph(util::FilePiece &from, Graph &graph) {
//First line should contain field names
StringPiece line = from.ReadLine();
UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'");
line = NextLine(from);
//Then expect numbers of vertices
util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' '));
unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i);
++i;
unsigned long int edges = boost::lexical_cast<unsigned long int>(*i);
graph.SetCounts(vertices, edges);
//cerr << "vertices: " << vertices << "; edges: " << edges << endl;
for (size_t i = 0; i < vertices; ++i) {
line = NextLine(from);
unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line);
Vertex* vertex = graph.NewVertex();
for (unsigned long int e = 0; e < edge_count; ++e) {
pair<Edge*,size_t> edge = ReadEdge(from, graph);
vertex->AddEdge(edge.first);
//Note: the file format attaches this to the edge, but it's really a property
//of the vertex.
if (!e) {vertex->SetSourceCovered(edge.second);}
}
}
}
};