mosesdecoder/phrase-extract/SentenceAlignment.cpp
Jeroen Vermeulen b2d821a141 Unify tokenize() into util, and unit-test it.
The duplicate definition works fine in environments where the inline
definition becomes a weak symbol in the object file, but if it gets
generated as a regular definition, the duplicate definition causes link
problems.

In most call sites the return value could easily be made const, which
gives both the reader and the compiler a bit more certainty about the code's
intentions.  In theory this may help performance, but it's mainly for clarity.

The comments are based on reverse-engineering, and the unit tests are based
on the comments.  It's possible that some of what's in there is not essential,
in which case, don't feel bad about changing it!

I left a third identical definition in place, though I updated it with my
changes to avoid creeping divergence, and noted the duplication in a comment.
It would be nice to get rid of this definition as well, but it'd introduce
headers from the main Moses tree into biconcor, which may be against policy.
2015-04-22 09:59:05 +07:00

145 lines
4.5 KiB
C++

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "SentenceAlignment.h"
#include <map>
#include <set>
#include <string>
#include "tables-core.h"
#include "util/tokenize.hh"
using namespace std;
namespace MosesTraining
{
SentenceAlignment::~SentenceAlignment() {}
void addBoundaryWords(vector<string> &phrase)
{
phrase.insert(phrase.begin(), "<s>");
phrase.push_back("</s>");
}
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
target = util::tokenize(targetString);
if (boundaryRules)
addBoundaryWords(target);
return true;
}
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
source = util::tokenize(sourceString);
if (boundaryRules)
addBoundaryWords(source);
return true;
}
bool SentenceAlignment::create(const char targetString[],
const char sourceString[],
const char alignmentString[],
const char weightString[],
int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
this->weightString = std::string(weightString);
// process sentence strings and store in target and source members.
if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
return false;
}
if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
return false;
}
// check if sentences are empty
if (target.size() == 0 || source.size() == 0) {
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
// prepare data structures for alignments
for(size_t i=0; i<source.size(); i++) {
alignedCountS.push_back( 0 );
}
for(size_t i=0; i<target.size(); i++) {
vector< int > dummy;
alignedToT.push_back( dummy );
}
// reading in alignments
vector<string> alignmentSequence = util::tokenize( alignmentString );
for(size_t i=0; i<alignmentSequence.size(); i++) {
int s,t;
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
if (boundaryRules) {
++s;
++t;
}
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
alignedToT[t].push_back( s );
alignedCountS[s]++;
}
if (boundaryRules) {
alignedToT[0].push_back(0);
alignedCountS[0]++;
alignedToT.back().push_back(alignedCountS.size() - 1);
alignedCountS.back()++;
}
return true;
}
void SentenceAlignment::invertAlignment()
{
alignedToS.resize(source.size());
for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) {
const std::vector<int> &vec = alignedToT[targetPos];
for (size_t i = 0; i < vec.size(); ++i) {
int sourcePos = vec[i];
alignedToS[sourcePos].push_back(targetPos);
}
}
}
}