2010-07-29 01:28:14 +04:00
|
|
|
/*
|
|
|
|
* PhraseAlignment.cpp
|
|
|
|
* extract
|
|
|
|
*
|
|
|
|
* Created by Hieu Hoang on 28/07/2010.
|
|
|
|
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2011-09-14 14:23:14 +04:00
|
|
|
#include <sstream>
|
2010-07-29 01:28:14 +04:00
|
|
|
#include "PhraseAlignment.h"
|
|
|
|
#include "SafeGetline.h"
|
|
|
|
#include "tables-core.h"
|
|
|
|
#include "score.h"
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
extern Vocabulary vcbT;
|
|
|
|
extern Vocabulary vcbS;
|
|
|
|
|
|
|
|
extern bool hierarchicalFlag;
|
|
|
|
|
2011-09-14 14:23:14 +04:00
|
|
|
//! Convert a string to a value of type T using stream extraction.
//! Used for reading floats, ints etc. from files.
//! \param input text to parse; only the leading token that operator>> accepts is consumed
//! \return the parsed value, or a value-initialized T (0 for arithmetic types,
//!         empty for std::string) when nothing could be extracted
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  // Value-initialize: if the extraction fails before anything is written
  // (e.g. empty input) the result would otherwise be indeterminate.
  T ret = T();
  stream >> ret;
  return ret;
}
|
|
|
|
|
|
|
|
|
|
|
|
//! Convert a whole vector of strings in one call.
//! \param output resized to input.size(); element i receives Scan<T>(input[i]),
//!        so any previous contents are overwritten
//! \param input  the strings to convert
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  output.resize(input.size());

  // walk both vectors in lock-step
  typename std::vector<T>::iterator dst = output.begin();
  for (std::vector< std::string >::const_iterator src = input.begin();
       src != input.end(); ++src, ++dst) {
    *dst = Scan<T>( *src );
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
//! Split a string into tokens and append them to output.
//! \param output     tokens are appended; existing elements are kept
//! \param str        the text to split
//! \param delimiters every character in this string acts as a separator;
//!                   runs of separators never produce empty tokens
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // start of the first token (skips any leading delimiters)
  std::string::size_type tokenStart = str.find_first_not_of(delimiters);

  while (tokenStart != std::string::npos) {
    // first delimiter after the token, i.e. one past its last character
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);

    if (tokenEnd == std::string::npos) {
      // last token runs to the end of the string
      output.push_back(str.substr(tokenStart));
      break;
    }

    output.push_back(str.substr(tokenStart, tokenEnd - tokenStart));

    // skip the delimiter run to reach the next token, if there is one
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
  }
}
|
|
|
|
|
|
|
|
// speeded up version of above
|
|
|
|
template<typename T>
|
|
|
|
inline void Tokenize( std::vector<T> &output
|
|
|
|
, const std::string &input
|
|
|
|
, const std::string& delimiters = " \t")
|
|
|
|
{
|
|
|
|
std::vector<std::string> stringVector;
|
|
|
|
Tokenize(stringVector, input, delimiters);
|
|
|
|
return Scan<T>(output, stringVector );
|
|
|
|
}
|
|
|
|
|
2010-07-29 01:28:14 +04:00
|
|
|
// read in a phrase pair and store it
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhraseAlignment::create( char line[], int lineID )
|
2010-07-29 01:28:14 +04:00
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
assert(phraseS.empty());
|
|
|
|
assert(phraseT.empty());
|
|
|
|
|
|
|
|
//cerr << "processing " << line;
|
|
|
|
vector< string > token = tokenize( line );
|
|
|
|
int item = 1;
|
2012-05-10 16:48:51 +04:00
|
|
|
for (size_t j=0; j<token.size(); j++) {
|
2011-02-24 16:57:11 +03:00
|
|
|
if (token[j] == "|||") item++;
|
|
|
|
else if (item == 1) { // source phrase
|
|
|
|
phraseS.push_back( vcbS.storeIfNew( token[j] ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
else if (item == 2) { // target phrase
|
|
|
|
phraseT.push_back( vcbT.storeIfNew( token[j] ) );
|
|
|
|
}
|
|
|
|
else if (item == 3) { // alignment
|
|
|
|
int s,t;
|
|
|
|
sscanf(token[j].c_str(), "%d-%d", &s, &t);
|
2012-05-10 16:48:51 +04:00
|
|
|
if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << "WARNING: phrase pair " << lineID
|
|
|
|
<< " has alignment point (" << s << ", " << t
|
|
|
|
<< ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
|
|
|
|
} else {
|
|
|
|
// first alignment point? -> initialize
|
|
|
|
createAlignVec(phraseS.size(), phraseT.size());
|
|
|
|
|
|
|
|
// add alignment point
|
|
|
|
alignedToT[t].insert( s );
|
|
|
|
alignedToS[s].insert( t );
|
|
|
|
}
|
|
|
|
} else if (item == 4) { // count
|
|
|
|
sscanf(token[j].c_str(), "%f", &count);
|
|
|
|
}
|
2011-09-14 14:23:14 +04:00
|
|
|
else if (item == 5) { // non-term lengths
|
|
|
|
addNTLength(token[j]);
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
createAlignVec(phraseS.size(), phraseT.size());
|
|
|
|
|
|
|
|
if (item == 3) {
|
|
|
|
count = 1.0;
|
|
|
|
}
|
2011-09-14 14:23:14 +04:00
|
|
|
if (item < 3 || item > 5) {
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
|
|
|
|
}
|
2010-07-29 01:28:14 +04:00
|
|
|
}
|
|
|
|
|
2011-09-14 14:23:14 +04:00
|
|
|
void PhraseAlignment::addNTLength(const std::string &tok)
|
|
|
|
{
|
|
|
|
vector< string > tokens;
|
|
|
|
|
|
|
|
Tokenize(tokens, tok, "=");
|
|
|
|
assert(tokens.size() == 2);
|
|
|
|
|
|
|
|
size_t sourcePos = Scan<size_t>(tokens[0]);
|
|
|
|
assert(sourcePos < phraseS.size());
|
|
|
|
|
|
|
|
vector< size_t > ntLengths;
|
|
|
|
Tokenize<size_t>(ntLengths, tokens[1], ",");
|
|
|
|
assert(ntLengths.size() == 2);
|
|
|
|
|
|
|
|
m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
|
|
|
|
}
|
|
|
|
|
2010-07-29 02:49:37 +04:00
|
|
|
void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
|
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
// in case of no align info. always need align info, even if blank
|
|
|
|
if (alignedToT.size() == 0) {
|
|
|
|
size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize);
|
|
|
|
alignedToT.resize(numTgtSymbols);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (alignedToS.size() == 0) {
|
|
|
|
size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize);
|
|
|
|
alignedToS.resize(numSrcSymbols);
|
|
|
|
}
|
2010-07-29 02:49:37 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
void PhraseAlignment::clear()
|
2010-07-29 02:49:37 +04:00
|
|
|
{
|
2010-12-15 02:49:57 +03:00
|
|
|
phraseS.clear();
|
|
|
|
phraseT.clear();
|
2010-07-29 01:28:14 +04:00
|
|
|
alignedToT.clear();
|
|
|
|
alignedToS.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if two word alignments between a phrase pair are the same
|
2011-02-24 16:57:11 +03:00
|
|
|
bool PhraseAlignment::equals( const PhraseAlignment& other )
|
2010-07-29 01:28:14 +04:00
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
if (this == &other) return true;
|
|
|
|
if (other.GetTarget() != GetTarget()) return false;
|
|
|
|
if (other.GetSource() != GetSource()) return false;
|
|
|
|
if (other.alignedToT != alignedToT) return false;
|
|
|
|
if (other.alignedToS != alignedToS) return false;
|
|
|
|
return true;
|
2010-07-29 01:28:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// check if two word alignments between a phrase pairs "match"
|
|
|
|
// i.e. they do not differ in the alignment of non-termimals
|
|
|
|
bool PhraseAlignment::match( const PhraseAlignment& other )
|
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
if (this == &other) return true;
|
|
|
|
if (other.GetTarget() != GetTarget()) return false;
|
|
|
|
if (other.GetSource() != GetSource()) return false;
|
|
|
|
if (!hierarchicalFlag) return true;
|
|
|
|
|
2010-07-29 01:28:14 +04:00
|
|
|
assert(phraseT.size() == alignedToT.size() + 1);
|
|
|
|
assert(alignedToT.size() == other.alignedToT.size());
|
2011-02-24 16:57:11 +03:00
|
|
|
|
|
|
|
// loop over all words (note: 0 = left hand side of rule)
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<phraseT.size()-1; i++) {
|
2011-02-24 16:57:11 +03:00
|
|
|
if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
|
|
|
|
if (alignedToT[i].size() != 1 ||
|
|
|
|
other.alignedToT[i].size() != 1 ||
|
|
|
|
*(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2010-07-29 01:28:14 +04:00
|
|
|
}
|
|
|
|
|
2011-07-27 11:55:03 +04:00
|
|
|
int PhraseAlignment::Compare(const PhraseAlignment &other) const
|
|
|
|
{
|
|
|
|
if (this == &other) // comparing with itself
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (GetTarget() != other.GetTarget())
|
|
|
|
return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
|
|
|
|
|
|
|
|
if (GetSource() != other.GetSource())
|
|
|
|
return ( GetSource() < other.GetSource() ) ? -1 : +1;
|
|
|
|
|
2011-07-27 13:40:58 +04:00
|
|
|
if (!hierarchicalFlag)
|
|
|
|
return 0;
|
2011-07-27 11:55:03 +04:00
|
|
|
|
2011-07-27 13:40:58 +04:00
|
|
|
// loop over all words (note: 0 = left hand side of rule)
|
2012-05-10 16:48:51 +04:00
|
|
|
for(size_t i=0; i<phraseT.size()-1; i++) {
|
2011-07-27 13:40:58 +04:00
|
|
|
if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
|
|
|
|
size_t thisAlign = *(alignedToT[i].begin());
|
|
|
|
size_t otherAlign = *(other.alignedToT[i].begin());
|
2011-07-27 11:55:03 +04:00
|
|
|
|
2011-07-27 13:40:58 +04:00
|
|
|
if (alignedToT[i].size() != 1 ||
|
|
|
|
other.alignedToT[i].size() != 1 ||
|
|
|
|
thisAlign != otherAlign)
|
|
|
|
{
|
|
|
|
int ret = (thisAlign < otherAlign) ? -1 : +1;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-07-27 11:55:03 +04:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2010-07-29 01:28:14 +04:00
|
|
|
|