2010-04-13 19:34:39 +04:00
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2010 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2010-04-12 19:22:50 +04:00
|
|
|
#include "SentenceAlignment.h"
|
2010-04-13 19:34:39 +04:00
|
|
|
|
|
|
|
#include <map>
|
|
|
|
#include <set>
|
|
|
|
#include <string>
|
|
|
|
|
2010-04-12 19:22:50 +04:00
|
|
|
#include "tables-core.h"
|
|
|
|
|
2010-06-29 14:41:42 +04:00
|
|
|
bool SentenceAlignment::processTargetSentence(const char * targetString, int)
|
2010-04-13 19:34:39 +04:00
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
target = tokenize(targetString);
|
|
|
|
return true;
|
2010-04-13 19:34:39 +04:00
|
|
|
}
|
2010-04-12 19:22:50 +04:00
|
|
|
|
2010-06-29 14:41:42 +04:00
|
|
|
bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
|
2010-04-13 19:34:39 +04:00
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
source = tokenize(sourceString);
|
|
|
|
return true;
|
2010-04-13 19:34:39 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
|
|
|
|
{
|
|
|
|
using namespace std;
|
2010-04-13 19:34:39 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
// process sentence strings and store in target and source members.
|
|
|
|
if (!processTargetSentence(targetString, sentenceID)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (!processSourceSentence(sourceString, sentenceID)) {
|
|
|
|
return false;
|
|
|
|
}
|
2010-04-12 19:22:50 +04:00
|
|
|
|
|
|
|
// check if sentences are empty
|
|
|
|
if (target.size() == 0 || source.size() == 0) {
|
|
|
|
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
|
|
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
2010-04-13 19:34:39 +04:00
|
|
|
return false;
|
2010-04-12 19:22:50 +04:00
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-04-12 19:22:50 +04:00
|
|
|
// prepare data structures for alignments
|
|
|
|
for(int i=0; i<source.size(); i++) {
|
|
|
|
alignedCountS.push_back( 0 );
|
|
|
|
}
|
|
|
|
for(int i=0; i<target.size(); i++) {
|
|
|
|
vector< int > dummy;
|
|
|
|
alignedToT.push_back( dummy );
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-04-12 19:22:50 +04:00
|
|
|
// reading in alignments
|
|
|
|
vector<string> alignmentSequence = tokenize( alignmentString );
|
|
|
|
for(int i=0; i<alignmentSequence.size(); i++) {
|
|
|
|
int s,t;
|
|
|
|
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
|
|
|
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
|
2011-02-24 16:57:11 +03:00
|
|
|
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
|
2010-04-12 19:22:50 +04:00
|
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
2010-04-13 19:34:39 +04:00
|
|
|
return false;
|
2010-04-12 19:22:50 +04:00
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
|
|
|
|
if (t >= target.size() || s >= source.size()) {
|
2010-04-12 19:22:50 +04:00
|
|
|
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
|
|
|
|
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
2010-04-13 19:34:39 +04:00
|
|
|
return false;
|
2010-04-12 19:22:50 +04:00
|
|
|
}
|
|
|
|
alignedToT[t].push_back( s );
|
|
|
|
alignedCountS[s]++;
|
|
|
|
}
|
2010-04-13 19:34:39 +04:00
|
|
|
return true;
|
2010-04-12 19:22:50 +04:00
|
|
|
}
|