Support storing coordinates of target phrase

- Keep track of named spaces in StaticData
- Adding coords to phrases implemented for Mmsapt
This commit is contained in:
Michael Denkowski 2016-08-08 16:33:24 +02:00
parent d29916bbb3
commit ae1e51d81a
9 changed files with 95 additions and 52 deletions

View File

@ -68,13 +68,8 @@ public:
size_t m_frontSpanCoveredLength;
// how many words from the beginning are covered
// Coordinates in user-defined spaces, indexed by phrase dictionary pointer
// Looking up PD* returns a vector of the input's coordinates in each space
// known to the PD, in order (vector of pointers to float vectors). This
// allows different models to use different subsets of all named spaces.
typedef std::vector<boost::shared_ptr<std::vector<float> > > INCOORD;
typedef std::map<PhraseDictionary const*, INCOORD> PD2IC;
boost::shared_ptr<PD2IC> m_pd2InputCoord;
// Coordinates in user-defined spaces (see "coord" XML tag)
SPTR<std::map<size_t const, std::vector<float> > > m_coordMap;
InputType(AllOptions::ptr const& opts, long translationId = 0);
virtual ~InputType();

View File

@ -936,4 +936,25 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
}
}
// Look up the numeric ID assigned to the named coordinate space.
//
// Returns the ID stored in m_coordSpaceMap for `space`, or 0 when the name
// has not been registered. IDs handed out by MapCoordSpace() start at 1
// (m_coordSpaceNextID is initialised to 1), so 0 can be used by callers as
// the "unknown space" marker.
size_t StaticData::GetCoordSpace(string space) const
{
  // The iterator type is spelled with the same key type as the member's
  // declaration (std::map<std::string const, size_t>); iterators of
  // map<string, size_t> are a different type and are not guaranteed to be
  // interchangeable with it.
  map<string const, size_t>::const_iterator m = m_coordSpaceMap.find(space);
  if (m == m_coordSpaceMap.end()) {
    return 0;
  }
  return m->second;
}
// Return the numeric ID for the named coordinate space, registering the name
// first if it has not been seen before.
//
// A name that is already present keeps its existing ID. A new name receives
// the current value of m_coordSpaceNextID, after which the counter is
// advanced by one. This function mutates m_coordSpaceMap and
// m_coordSpaceNextID and performs no locking of its own.
size_t StaticData::MapCoordSpace(string space)
{
  // Same key type as the member declaration, so the iterator types match.
  typedef map<string const, size_t> CoordSpaceMap_t;

  // insert() leaves an existing entry untouched and tells us whether the key
  // was new, so one lookup covers both the "known" and "unknown" case.
  std::pair<CoordSpaceMap_t::iterator, bool> slot =
    m_coordSpaceMap.insert(CoordSpaceMap_t::value_type(space, m_coordSpaceNextID));
  if (slot.second) {
    // The candidate ID was consumed by the new entry; move on to the next.
    m_coordSpaceNextID += 1;
  }
  return slot.first->second;
}
} // namespace

View File

@ -60,7 +60,7 @@ class PhraseDictionaryDynamicCacheBased;
typedef std::pair<std::string, float> UnknownLHSEntry;
typedef std::vector<UnknownLHSEntry> UnknownLHSList;
/** Contains global variables and contants.
/** Contains global variables and constants.
* Only 1 object of this class should be instantiated.
* A const object of this class is accessible by any function during decoding by calling StaticData::Instance();
*/
@ -152,6 +152,12 @@ protected:
bool ini_performance_options();
void initialize_features();
// Coordinate space name map for matching spaces across XML input ("coord"
// tag) and feature functions that assign or use coordinates on target phrases
std::map< std::string const, size_t > m_coordSpaceMap;
size_t m_coordSpaceNextID = 1;
public:
//! destructor
@ -394,6 +400,9 @@ public:
return m_requireSortingAfterSourceContext;
}
// Coordinate spaces
size_t GetCoordSpace(std::string space) const;
size_t MapCoordSpace(std::string space);
};
}

View File

@ -333,6 +333,29 @@ SetExtraScores(FeatureFunction const* ff,
m_cached_scores[ff] = s;
}
// Return the list of coordinate-vector pointers recorded for this target
// phrase in the coordinate space identified by spaceID.
//
// The returned reference points into the entry held by m_cached_coord.
// Two conditions are checked through UTIL_THROW_IF2 (presumably raising an
// exception carrying the given message -- the macro is defined elsewhere):
//   - no coordinate cache has been allocated for this phrase at all;
//   - the cache exists but has no entry for spaceID.
vector<vector<float> const*> const&
TargetPhrase::
GetCoordList(size_t const spaceID) const
{
// The cache is only allocated once a coordinate has been pushed.
UTIL_THROW_IF2(!m_cached_coord,
"No coordinates known for target phrase");
CoordCache_t::const_iterator m = m_cached_coord->find(spaceID);
// find() rather than operator[] so that a missing space is reported instead
// of silently creating an empty entry.
UTIL_THROW_IF2(m == m_cached_coord->end(),
"No coordinates known in given space for target phrase");
return m->second;
}
void
TargetPhrase::
PushCoord(size_t const spaceID,
vector<float> const* coord)
{
if (!m_cached_coord) {
m_cached_coord.reset(new CoordCache_t);
}
vector<vector<float> const *>& coordList = (*m_cached_coord)[spaceID];
coordList.push_back(coord);
}
void TargetPhrase::SetProperties(const StringPiece &str)
{

View File

@ -56,9 +56,16 @@ public:
Scores const* GetExtraScores(FeatureFunction const* ff) const;
void SetExtraScores(FeatureFunction const* ff,boost::shared_ptr<Scores> const& scores);
typedef std::map<size_t const, std::vector<std::vector<float> const*> > CoordCache_t;
std::vector<std::vector<float> const*> const& GetCoordList(size_t const spaceID) const;
void PushCoord(size_t const spaceID, std::vector<float> const* coord);
private:
ScoreCache_t m_cached_scores;
// The coordinate cache stores vectors of pointers to vectors. The coordinate
// vectors referenced by the pointers should be owned by the phrase dictionary
// implementation.
SPTR<CoordCache_t> m_cached_coord;
WPTR<ContextScope> m_scope;
private:

View File

@ -147,14 +147,6 @@ public:
void SetParameter(const std::string& key, const std::string& value);
void AddKnownSpace(const std::string& name) {
m_knownSpaces.push_back(name);
}
const std::vector<std::string> &GetKnownSpaces() const {
return m_knownSpaces;
}
// LEGACY
//! find list of translations that can translates a portion of src. Used by confusion network decoding
virtual
@ -179,9 +171,6 @@ protected:
// cache
size_t m_maxCacheSize; // 0 = no caching
// Named coordinate spaces used by this model, in order (see "coord" XML tag)
std::vector<std::string> m_knownSpaces;
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::thread_specific_ptr<CacheColl> m_cache;

View File

@ -286,16 +286,17 @@ namespace Moses
BOOST_FOREACH(std::string instance, coord_instances)
{
vector<string> toks = Moses::Tokenize(instance, ":");
string name = toks[0];
string space = toks[0];
string file = toks[1];
//TODO: register this space for this model
// Register that this model uses the given space
m_coord_spaces.push_back(StaticData::InstanceNonConst().MapCoordSpace(space));
// Load sid coordinates from file
m_sid_coord_list.push_back(vector<vector<float> >());
vector<vector<float> >& sid_coord = m_sid_coord_list[m_sid_coord_list.size() - 1];
//TODO: support extra data for btdyn, here? extra?
sid_coord.reserve(btfix->T1->size());
string line;
cerr << "Loading coordinate lines for space \"" << name << "\" from " << file << endl;
cerr << "Loading coordinate lines for space \"" << space << "\" from " << file << endl;
iostreams::filtering_istream in;
ugdiss::open_input_stream(file, in);
while(getline(in, line))
@ -648,19 +649,27 @@ namespace Moses
}
#endif
// Track stats for rescoring non-cacheable phrases as needed
// Track coordinates if requested
if (m_track_coord)
{
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1) << endl;
BOOST_FOREACH(uint32_t const sid, *pool.sids)
{
BOOST_FOREACH(vector<vector<float> > coord, m_sid_coord_list)
for(size_t i = 0; i < m_coord_spaces.size(); ++i)
{
//TODO: store coord[sid] in tp
cerr << " : " << Join(" ", coord[sid]);
tp->PushCoord(m_coord_spaces[i], &m_sid_coord_list[i][sid]);
}
cerr << endl;
}
/*
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1);
BOOST_FOREACH(size_t id, m_coord_spaces)
{
cerr << " [" << id << "]";
vector<vector<float> const*> const& coordList = tp->GetCoordList(id);
BOOST_FOREACH(vector<float> const* coord, coordList)
cerr << " : " << Join(" ", *coord);
}
cerr << endl;
*/
}
return tp;

View File

@ -119,8 +119,10 @@ namespace Moses
std::vector<SPTR<pscorer > > m_active_ff_common;
// activated feature functions (dyn)
bool m_track_coord = false; // track coordinates? Effectively: track sids when sampling bitext?
bool m_track_coord = false; // track coordinates? Track sids when sampling
// from bitext, append coords to target phrases
std::vector<std::vector<std::vector<float> > > m_sid_coord_list;
std::vector<size_t> m_coord_spaces;
void
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);

View File

@ -405,33 +405,21 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
// Coord: coordinates of the input sentence in a user-defined space
// <coord space="NAME" coord="X Y Z ..." />
// where NAME is the name of the space and X Y Z ... are floats. See
// PScoreDist in PhraseDictionaryBitextSampling (Mmsapt) for an example
// of using this information for feature scoring.
// TODO for an example of using this information for feature scoring.
else if (tagName == "coord") {
// Parse tag
string space = ParseXmlTagAttribute(tagContent, "space");
vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
boost::shared_ptr<vector<float> > coord(new vector<float>);
Scan<float>(*coord, toks);
// Init if needed
if (!input.m_pd2InputCoord) {
input.m_pd2InputCoord.reset(new std::map<PhraseDictionary const*, std::vector<boost::shared_ptr<std::vector<float> > > >);
}
// Scan phrase dictionaries to see which (if any) use this space
BOOST_FOREACH(PhraseDictionary const* pd, PhraseDictionary::GetColl()) {
const vector<string>& pdKnownSpaces = pd->GetKnownSpaces();
for (size_t i = 0; i < pdKnownSpaces.size(); ++i) {
// Match
if (pdKnownSpaces[i] == space) {
// Make sure a slot to store the coordinates exists
std::vector<boost::shared_ptr<std::vector<float> > >& inputCoord = (*input.m_pd2InputCoord)[pd];
if (inputCoord.size() < i + 1) {
inputCoord.resize(i + 1);
}
// Store
inputCoord[i] = coord;
}
vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
size_t id = StaticData::Instance().GetCoordSpace(space);
if (!id) {
TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
} else {
// Init if needed
if (!input.m_coordMap) {
input.m_coordMap.reset(new std::map<size_t const, std::vector<float> >);
}
vector<float>& coord = (*input.m_coordMap)[id];
Scan<float>(coord, tok);
}
}