mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-08 20:46:59 +03:00
b2d821a141
The duplicate definition works fine in environments where the inline definition becomes a weak symbol in the object file, but if it gets generated as a regular definition, the duplicate definition causes link problems. In most call sites the return value could easily be made const, which gives both the reader and the compiler a bit more certainty about the code's intentions. In theory this may help performance, but it's mainly for clarity. The comments are based on reverse-engineering, and the unit tests are based on the comments. It's possible that some of what's in there is not essential, in which case, don't feel bad about changing it! I left a third identical definition in place, though I updated it with my changes to avoid creeping divergence, and noted the duplication in a comment. It would be nice to get rid of this definition as well, but it'd introduce headers from the main Moses tree into biconcor, which may be against policy.
145 lines
4.5 KiB
C++
145 lines
4.5 KiB
C++
/***********************************************************************
|
|
Moses - factored phrase-based language decoder
|
|
Copyright (C) 2010 University of Edinburgh
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this library; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
***********************************************************************/
|
|
|
|
#include "SentenceAlignment.h"
|
|
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
|
|
#include "tables-core.h"
|
|
#include "util/tokenize.hh"
|
|
|
|
using namespace std;
|
|
|
|
namespace MosesTraining
|
|
{
|
|
|
|
// Out-of-line destructor definition; the body is intentionally empty.
SentenceAlignment::~SentenceAlignment()
{
}
|
|
|
|
// Wrap a token sequence in sentence-boundary markers, in place:
// "<s>" becomes the first element and "</s>" the last one.
// An empty input therefore ends up as exactly { "<s>", "</s>" }.
void addBoundaryWords(vector<string> &phrase)
{
  const string sentenceStart("<s>");
  const string sentenceEnd("</s>");

  // The two insertions are independent, so append first and prepend second.
  phrase.push_back(sentenceEnd);
  phrase.insert(phrase.begin(), sentenceStart);
}
|
|
|
|
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
|
|
{
|
|
target = util::tokenize(targetString);
|
|
if (boundaryRules)
|
|
addBoundaryWords(target);
|
|
return true;
|
|
}
|
|
|
|
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
|
|
{
|
|
source = util::tokenize(sourceString);
|
|
if (boundaryRules)
|
|
addBoundaryWords(source);
|
|
return true;
|
|
}
|
|
|
|
bool SentenceAlignment::create(const char targetString[],
|
|
const char sourceString[],
|
|
const char alignmentString[],
|
|
const char weightString[],
|
|
int sentenceID, bool boundaryRules)
|
|
{
|
|
using namespace std;
|
|
this->sentenceID = sentenceID;
|
|
this->weightString = std::string(weightString);
|
|
|
|
// process sentence strings and store in target and source members.
|
|
if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
|
|
return false;
|
|
}
|
|
if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
|
|
return false;
|
|
}
|
|
|
|
// check if sentences are empty
|
|
if (target.size() == 0 || source.size() == 0) {
|
|
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
|
return false;
|
|
}
|
|
|
|
// prepare data structures for alignments
|
|
for(size_t i=0; i<source.size(); i++) {
|
|
alignedCountS.push_back( 0 );
|
|
}
|
|
for(size_t i=0; i<target.size(); i++) {
|
|
vector< int > dummy;
|
|
alignedToT.push_back( dummy );
|
|
}
|
|
|
|
// reading in alignments
|
|
vector<string> alignmentSequence = util::tokenize( alignmentString );
|
|
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
|
int s,t;
|
|
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
|
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
|
|
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
|
return false;
|
|
}
|
|
|
|
if (boundaryRules) {
|
|
++s;
|
|
++t;
|
|
}
|
|
|
|
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
|
|
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
|
|
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
|
return false;
|
|
}
|
|
alignedToT[t].push_back( s );
|
|
alignedCountS[s]++;
|
|
}
|
|
|
|
if (boundaryRules) {
|
|
alignedToT[0].push_back(0);
|
|
alignedCountS[0]++;
|
|
|
|
alignedToT.back().push_back(alignedCountS.size() - 1);
|
|
alignedCountS.back()++;
|
|
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void SentenceAlignment::invertAlignment()
|
|
{
|
|
alignedToS.resize(source.size());
|
|
for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) {
|
|
const std::vector<int> &vec = alignedToT[targetPos];
|
|
for (size_t i = 0; i < vec.size(); ++i) {
|
|
int sourcePos = vec[i];
|
|
alignedToS[sourcePos].push_back(targetPos);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
|